Ejemplo n.º 1
0
    def spider(self):
        """Page through search results for each queued city code.

        Removes ``temp_ts.dat`` if it exists, calls ``self.create_table()``,
        then repeatedly downloads ``self.base_url`` formatted with
        ``(city_code, i * 10, self.keyword)``.  A reply containing ``"data"``
        with a positive ``totalCount`` is handed to
        ``self.download_page_info``.  A reply without ``"data"`` switches to
        the next code from ``self.ids``, or ends the crawl when that queue is
        empty.  The crawl also ends once the page counter reaches
        ``self.num``.

        NOTE(review): the page counter is not reset when the city code
        changes, so a new city starts at the current offset - confirm this
        is intended.
        NOTE(review): a failed download is retried immediately with the same
        URL and without any limit.
        """
        # Remove the scratch file if it exists.
        if os.path.exists("temp_ts.dat"):
            os.remove("temp_ts.dat")
        self.create_table()

        i = 0
        city_code = self.ids.get_nowait()
        while True:
            url = self.base_url % (city_code, i * 10, self.keyword)
            page = PageDownload().simple_download(url)
            if not page:
                # Nothing downloaded: try the same URL again.
                continue

            json_data = json.loads(page)
            if "data" in json_data:
                data = json_data["data"]
                if int(data["totalCount"]) > 0:
                    self.download_page_info(data)
            elif not self.ids.empty():
                # No "data" for this city: move on to the next queued code
                # without advancing the page counter.
                city_code = self.ids.get_nowait()
                continue
            else:
                # No "data" and no city codes left: the crawl is finished.
                return

            i += 1
            if i == self.num:
                return
Ejemplo n.º 2
0
 def vertify_proxy(self):
     """Probe every address in ``self.ips`` and record the ones that respond.

     Each address is wrapped as an ``{'http': 'http://<ip>'}`` mapping, passed
     to a fresh ``PageDownload`` and used to fetch ``self.test_url``.  When
     the download returns anything other than ``None`` the address is printed
     and appended to ``self.avaliable_ips``.
     """
     for ip in self.ips:
         proxy = {'http': 'http://' + ip}
         page = PageDownload(proxy).simple_download(self.test_url)
         if page is not None:
             # Single-argument parenthesised form: prints the same text under
             # the Python 2 print statement and the Python 3 print function.
             print(ip)
             self.avaliable_ips.append(ip)
Ejemplo n.º 3
0
 def download_list_urls(self, url, filter_arg, next_url_filter=None):
     """Download a listing page and extract its entries and next-page link.

     Args:
         url: Address of the listing page to download.
         filter_arg: Regex applied with ``re.findall`` to collect entries.
         next_url_filter: Optional regex; its first match is returned as the
             next-page URL.

     Returns:
         A ``(company_host_list, next_url)`` tuple.  ``next_url`` is ``None``
         when no filter was given or the filter matched nothing.  When the
         download yields no page the result is ``([], None)``.
     """
     page = PageDownload().simple_download(url)
     if not page:
         # Without a page there is nothing to match; re.findall would raise
         # TypeError on None.
         return [], None

     company_host_list = re.findall(filter_arg, page)
     next_url = None
     if next_url_filter:
         matches = re.findall(next_url_filter, page)
         if matches:
             next_url = matches[0]
     return company_host_list, next_url
Ejemplo n.º 4
0
 def get_phone_info(self, phone_num):
     """Query showji.com for the province, city and corp of a phone number.

     Args:
         phone_num: Phone number string; appended verbatim to the query URL.

     Returns:
         ``[Province, City, Corp]`` taken from the JSON reply when its
         ``QueryResult`` field equals ``"True"``.  ``None`` when the download
         fails, the reply is not a JSON object, or the lookup did not
         succeed.
     """
     url = "http://v.showji.com/Locating/showji.com20180331.aspx?&output=json&&m=" + phone_num
     page = PageDownload().simple_download(url)
     if page is None:
         return None

     try:
         json_data = json.loads(page)
     except ValueError:
         # The reply was not JSON; treat it like a failed lookup.
         return None

     if not isinstance(json_data, dict) or json_data.get("QueryResult") != "True":
         return None
     return [json_data["Province"], json_data["City"], json_data["Corp"]]
Ejemplo n.º 5
0
 def download_list_urls(self, url):
     """Download a JSON listing and build 1688.com offer detail URLs from it.

     Args:
         url: Address of the JSON listing to download.

     Returns:
         A list with one ``http://detail.1688.com/offer/<offerid>.html`` URL
         per item under ``data.content.offerResult``.  The list is empty when
         the download yields no page or the reply has no ``offerResult``.

     Raises:
         ValueError: If the page is not valid JSON.
         KeyError: If the reply lacks ``data`` / ``content`` or an item lacks
             ``offerid``.
     """
     page = PageDownload().simple_download(url)
     if not page:
         # json.loads would fail on a missing page; report "no URLs" instead.
         return []

     content = json.loads(page)["data"]["content"]
     if "offerResult" not in content:
         return []
     return [
         "http://detail.1688.com/offer/%s.html" % item["offerid"]
         for item in content["offerResult"]
     ]
Ejemplo n.º 6
0
 def collect_proxy(self):
     """Download ``self.proxy_url`` and append the proxies found to ``self.ips``.

     When the page contains the substrings ``table``, ``td`` and ``tr`` it is
     parsed with ``self.reg_table``; the first two groups of every match are
     joined as ``"<group0>:<group1>"``.  Otherwise ``self.reg`` is used and
     every match is appended unchanged.  Nothing happens when the download
     returns ``None``.
     """
     page = PageDownload().simple_download(self.proxy_url)
     if page is None:
         return

     # Use membership tests: str.find returns -1 (truthy) when the substring
     # is missing and 0 (falsy) when it is at the very start, so it cannot be
     # used directly as a boolean.
     if 'table' in page and 'td' in page and 'tr' in page:
         self.ips.extend(
             match[0] + ":" + match[1]
             for match in re.findall(self.reg_table, page)
         )
     else:
         self.ips.extend(re.findall(self.reg, page))
Ejemplo n.º 7
0
    def download_company_page(self, url):
        downloader = PageDownload()
        page = downloader.simple_download(url)
        if not page:
            return False

        phone_num = re.findall(self.hq_phoneNum_filter, page)
        contact = re.findall(self.hq_contact_filter, page)
        company_name = re.findall(self.hq_companyName_filter, page)
        address = re.findall(self.hq_address_filter, page)
        if company_name:
            company_name = company_name[0]
        else:
            company_name = "未知"
        if address:
            address = address[0]
        else:
            address = "未知"
        if contact:
            contact = contact[0]
        else:
            contact = "未知"
        if phone_num:
            phone_num = phone_num[0]
        else:
            return False
        try:
            with open("temp_tt.dat", "a") as f:
                f.write(phone_num + "*" + contact + "\n")
        except:
            pass

        res = self.get_phone_info(phone_num)
        if res:
            province = res[0]
            city = res[1]
            yys = res[2]
        else:
            province = "未知"
            city = "未知"
            yys = "未知"
        print url, phone_num, contact, company_name, address
        db = SqlLiteHandle()
        sql = "insert into tt_1 values (?,?,?,?,?,?,?)"
        db.insert(
            sql,
            [(phone_num, contact, company_name, address, province, city, yys)])
        db.close()
        time.sleep(0.2)
        return True
Ejemplo n.º 8
0
    def get_phone_info(self, phone_num):
        """Resolve province, city and corp for a phone number.

        showji.com is queried first.  Only when that download returns
        ``None`` is the taobao ``mobile_tel_segment`` endpoint tried instead;
        its reply is scanned with regexes for ``province`` and ``catName``
        and carries no city.

        Args:
            phone_num: Phone number string; appended verbatim to the query
                URLs.

        Returns:
            A three-item list ``[province, city, corp]``.  From showji.com
            these are the ``Province``, ``City`` and ``Corp`` fields of a
            reply whose ``QueryResult`` equals ``"True"``.  From the taobao
            fallback ``city`` is always ``None`` and a field that did not
            match is ``None``.  ``None`` is returned when both downloads
            fail, the showji.com reply is not a JSON object or reports an
            unsuccessful lookup, or the fallback reply matches neither field.
        """
        url = "http://v.showji.com/Locating/showji.com20180331.aspx?&output=json&&m=" + phone_num
        page = PageDownload().simple_download(url)

        if page is None:
            taobao_api = "http://tcc.taobao.com/cc/json/mobile_tel_segment.htm?tel=" + phone_num
            page = PageDownload().simple_download(taobao_api)
            if not page:
                return None
            province = re.findall(r"province:'([^:]+)',", page)
            yys = re.findall(r"catName:'([^:]+)',", page)
            if not province and not yys:
                return None
            # re.findall returns lists; return the first match of each so the
            # result holds plain strings like the showji.com branch.
            return [
                province[0] if province else None,
                None,
                yys[0] if yys else None,
            ]

        try:
            json_data = json.loads(page)
        except ValueError:
            # The reply was not JSON; treat it like a failed lookup.
            return None

        if not isinstance(json_data, dict) or json_data.get("QueryResult") != "True":
            return None
        return [
            json_data["Province"], json_data["City"], json_data["Corp"]
        ]
Ejemplo n.º 9
0
    def spider(self):
        """Page through keyword search results until they run out.

        Removes ``temp_ts.dat`` if it exists, calls ``self.create_table()``,
        then repeatedly downloads ``self.base_url`` formatted with
        ``(self.keyword, i)``.  Every reply containing a ``"poi"`` key is
        passed to ``self.download_page_info``.  The crawl ends at the first
        reply without ``"poi"`` or once the page counter reaches
        ``self.num``.

        NOTE(review): a failed download is retried immediately with the same
        URL and without any limit.
        """
        # Remove the scratch file if it exists.
        if os.path.exists("temp_ts.dat"):
            os.remove("temp_ts.dat")
        self.create_table()

        i = 0
        while True:
            url = self.base_url % (self.keyword, i)
            page = PageDownload().simple_download(url)
            if not page:
                # Nothing downloaded: try the same URL again.
                continue

            json_data = json.loads(page)
            if "poi" not in json_data:
                return
            self.download_page_info(json_data)

            i += 1
            if i == self.num:
                return
Ejemplo n.º 10
0
    def spider(self):
        """Page through keyword search results, falling back to keyed requests.

        Removes ``temp_ts.dat`` if it exists, calls ``self.create_table()``,
        then for each page ``i`` downloads ``self.base_url_nm`` formatted with
        ``(i, self.keyword)``.  A reply whose ``"status"`` is ``"1"`` goes to
        ``self.download_page_info_nm``.  Any other status triggers a second
        request to ``self.base_url`` formatted with
        ``(self.keyword, i, key)``, where ``key`` comes from
        ``amap_api_key``.  If that reply's status is ``"1"`` it goes to
        ``self.download_page_info``; otherwise the next key is selected and
        the same page is retried from the start.  The crawl ends at the first
        reply without a ``"status"`` key or once the page counter reaches
        ``self.num``.

        NOTE(review): failed first-stage downloads and rejected keys are
        retried without any limit.
        """
        # Remove the scratch file if it exists.
        if os.path.exists("temp_ts.dat"):
            os.remove("temp_ts.dat")
        self.create_table()

        i = 0
        id_i = 0
        key = amap_api_key[id_i]
        while True:
            url = self.base_url_nm % (i, self.keyword)
            page = PageDownload().simple_download(url)
            if not page:
                # Nothing downloaded: try the same URL again.
                continue

            json_data = json.loads(page)
            if "status" not in json_data:
                return

            if json_data["status"] == "1":
                self.download_page_info_nm(json_data)
            else:
                url = self.base_url % (self.keyword, i, key)
                page = PageDownload().simple_download(url)
                # A failed keyed download, or a reply without "status", is
                # not retried: the page is skipped and the counter advances.
                if page:
                    json_data = json.loads(page)
                    if "status" in json_data:
                        if json_data["status"] != "1":
                            # Rotate through every configured key, wrapping
                            # around at the end, then retry this page.
                            id_i = (id_i + 1) % len(amap_api_key)
                            key = amap_api_key[id_i]
                            continue
                        self.download_page_info(json_data)

            i += 1
            if i == self.num:
                return