Example #1
    def _generate_seed_url(self):
        """
        generate all URLs to visit
        """

        # from page 1 to any page number < 200

        # load the seed URLs from the database
        # NOTE: rebinds the imported Dao class name to an instance, so this
        # works only on the first call
        global Dao
        Dao = Dao()
        self._seed_url = Dao._get_url_by_id(self.source_id)
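
The global-rebinding pattern above is fragile: after the first call the name Dao refers to an instance, so a second call to Dao() raises TypeError. A minimal sketch of a module-level singleton that avoids shadowing the class name (the helper get_dao is hypothetical):

from DAO import Dao

_dao = None  # lazily created singleton; hypothetical name

def get_dao():
    # Create the Dao once and hand back the same instance afterwards.
    # Unlike `global Dao; Dao = Dao()`, the class name is never shadowed,
    # so repeated calls stay safe.
    global _dao
    if _dao is None:
        _dao = Dao()
    return _dao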
Example #2
 def _video_dao(self):
     dao = Dao()
     # check whether the record already exists
     query_sql = "SELECT * FROM merchant WHERE url = '{}'".format(
         self.__ne_detail["url"])
     if dao.execute_query(query_sql) is not None:
         print(" {} is already exists ,so next".format(
             self.__ne_detail["name"]))
         return
     # insert the record
     dao.execute_dmls(self._insert_merchant())
 def __init__(self,
              LT1,
              LG1,
              LT2,
              LG2,
              cityname,
              cityid,
              cityenname,
              name,
              shopId=0,
              categoryId=0):
     threading.Thread.__init__(self, name=name)
     self.cityid = cityid
     self.shopId = shopId
     self.categoryId = categoryId
     self.Lat1 = LT1
     self.Lat2 = LT2
     self.Long1 = LG1
     self.Long2 = LG2
     self.city_name = cityname
     self.values = {
         'promoId': '0',
         'shopType': '',
         'categoryId': '',
         'sortMode': '2',
         'shopSortItem': '1',
         'keyword': '',
         'searchType': '1',
         'branchGroupId': '0',
         'shippingTypeFilterValue': '0',
         'page': '1'
     }
     self.values["cityId"] = cityid
     self.values["cityEnName"] = cityenname
     self.url = "http://www.dianping.com/search/map/ajax/json"
     self.headers = {
         'User-Agent':
         'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0',
         'Referer':
         'http://www.dianping.com/search/map/category/{}/0'.format(cityid)
     }
     self.dao = Dao()
     self.query_sql = "SELECT shopType ,categoryId,NAME FROM category  WHERE categoryId <> shopType AND categoryId <>'None'  "
     self.result = self.dao.execute_query(self.query_sql)
     self.query_sql2 = "SELECT shopId FROM shop_bean where  city_name ='{}'".format(
         cityname)
     self.result2 = self.dao.execute_query(self.query_sql2)
     self.shopIds = []
     if self.result2 is not None:
         for shopid in self.result2:
             self.shopIds.append(shopid[0])
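
Throughout these examples SQL is built with str.format(), so a name or URL containing a quote breaks the statement and invites injection. A self-contained sketch of the same check-then-insert flow from _video_dao using driver-side parameters; it uses sqlite3 so it runs as-is, and the merchant table schema here is an assumption:

import sqlite3

def save_merchant(conn, detail):
    # '?' placeholders let the driver escape values, unlike str.format().
    cur = conn.cursor()
    cur.execute("SELECT 1 FROM merchant WHERE url = ?", (detail["url"],))
    if cur.fetchone() is not None:
        print("{} already exists, skipping".format(detail["name"]))
        return
    cur.execute("INSERT INTO merchant (url, name) VALUES (?, ?)",
                (detail["url"], detail["name"]))
    conn.commit()

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE merchant (url TEXT, name TEXT)")
save_merchant(conn, {"url": "http://example.com/1", "name": "demo"})
save_merchant(conn, {"url": "http://example.com/1", "name": "demo"})  # prints the skip message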
Example #4
    def start(self):

        if self._print_status:
            print('start mining #{} ...'.format(self._hastag))
        dao = Dao()
        # ORDER BY timeStamp must be unquoted: writing 'timeStamp' in quotes
        # sorts by a string constant and leaves the rows unordered
        timeStamp_tweet_list = dao.select(
            '*', "manager",
            "hastag = '{}' ORDER BY timeStamp".format(self._hastag))
        lastTweetTimeStamp = int(timeStamp_tweet_list[-1]['timeStamp'])

        if lastTweetTimeStamp > self.START_TIMESTAMP:
            self.mining(lastTweetTimeStamp, self.FINISH_TIMESTAMP)

        else:
            self.mining(self.START_TIMESTAMP, self.FINISH_TIMESTAMP)
    def _visit_pages(self, seed_url):
        """
        visit one URL and get its page content
        """

        for single_url in seed_url:
            # # fetch the html source
            # html = self.get_page_content_str(single_url)
            #
            # # choose which method parses it
            # self._extract_data(html)

            # dao=Dao()
            # insert_sql =" INSERT INTO fetch_list (source_id, url,times,page,STATUS) VALUE(99,'{}',0,0,0)".format(single_url)
            # dao.execute_dmls(insert_sql)

            dao = Dao()
            update_sql = "   UPDATE  fetch_list2 SET  times = times+1 WHERE url = '{}'and source_id = 98 ".format(
                single_url[0])
            dao.execute_dmls(update_sql)
            self._now_url = single_url[0]
            html = self.get_page_content_str(single_url[0])
            self._extract_data2(html)
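
start() assumes the manager table already holds a row for the hashtag; when it does not, timeStamp_tweet_list[-1] raises IndexError. A guarded sketch of the resume logic, with dao.select's signature assumed from the call above:

def resume_from(dao, hastag, start_timestamp):
    # Return the timestamp mining should resume from: the newest stored
    # tweet if one exists and it is later than the configured start.
    rows = dao.select('*', 'manager',
                      "hastag = '{}' ORDER BY timeStamp".format(hastag))
    if not rows:
        return start_timestamp
    return max(int(rows[-1]['timeStamp']), start_timestamp)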
Example #6
from DAO import Dao
import sys
from extract_tweet_ids import HastagMining
import time
from tweet_from_id import TweetFromID
from models import Tweet,User
from sentiment_analyze import Linguakit
from utils import printError, limparTexto

s1 = sys.argv[1]

if s1 == 'newHashtag':

    candidato, hastag = sys.argv[2], sys.argv[3]

    dao = Dao()
    r = dao.insert('manager', ['hastag', 'idTweet', 'idCandidato', 'timeStamp'],[hastag, '0', candidato, 0])

    print(">>> new hastag saved for mining use 'main.py mineHashtag {} {} {}'".format(candidato, hastag, 'p'))

elif s1 == 'mineHashtag':

    candidato, hastag = sys.argv[2], sys.argv[3]
    p = False
    if sys.argv[4] == 'p':
        p = True
    mining = HastagMining(hastag, candidato, print_status = p)
    mining.start()

elif s1 == 'mineTweet':
    arq = open('Last_id.txt','r')
                             display_name,logo,COMMUNITY_ID)
                VALUES ('',
                        '{}',
                        '{}',
                        '',
                        '{}',
                        '',
                        '{}',
                        '{}',
                        '{}',
                        '',
                        '{}',
                        '{}',
                        '{}',
                        '{}');'''.format(merchant['business_name'],
                                         merchant['phone_number'],
                                         merchant['address'], '',
                                         merchant['lat'], merchant['long'],
                                         merchant['sub_title'], param['cat'],
                                         merchant['logo'], COMMUNITY_ID)
                dao.execute_dmls(insertSql)
        except:
            pass


if __name__ == '__main__':
    querySql = 'SELECT a.name ,a.COMMUNITY_ID ,b.LATITUDE,b.LONGITUDE FROM  shengchan_20140815.communities a ,shengchan_20140815.community_poses b WHERE a.COMMUNITY_ID =b.COMMUNITY_ID AND a.AREA_ID <11 AND a.AREA_ID > 0 '
    dao = Dao()
    result = dao.execute_query(querySql)
    for name, COMMUNITY_ID, LATITUDE, LONGITUDE in result:
        getmerchants(LATITUDE, LONGITUDE, COMMUNITY_ID)
class CommunitiesListCrawler(BaseCrawler, threading.Thread):
    global Dao
    Dao = Dao()

    def __init__(self, page_num):
        # TODO
        threading.Thread.__init__(self, name=page_num)
        super().__init__()
        self.detail_info_urls = []
        self.source_id = 30
        self.min_page = page_num * 30 + 1
        self.max_page = page_num * 30 + 31
        self._base_url = "http://newhouse.hfhome.cn/"
        self._community_detail = {
            'url': '',
            'name': '',
            'location': '',
            'area_name': '',
            'description': '',
            'latitude': '',
            'longitude': ''
        }

    def _visit_pages(self, seed_url):
        """
        visit one URL and get its page content
        """
        # single URL
        html = self.get_page_content_str(seed_url)
        self.findEachBuilding(html)


    def findEachBuilding(self, html):
        doc = Pq(html)
        tr_list = doc("table#GVFwxkz>tr")
        for tr in tr_list:
            name = Pq(tr)("td:eq(1)").text()
            self._community_detail["name"] = name
            href = doc(tr).find("td>a").attr("href")
            if href is None:
                continue
            href = href[href.index("?"):]
            url = "http://newhouse.hfhome.cn/Modal/RoomList.aspx" + href
            if self._check_community(url):
                print(url + "     ---    已经爬取过了")
                continue
            self._community_detail["url"] = url
            self._extract_data(url)

    def _extract_data(self, url):
        community_id = self._save_community()
        doc_str = self.get_page_content_str(url)
        doc = Pq(doc_str)
        tr_list = doc("table>tr")
        try:
            for tr in tr_list:
                Floor_num = Pq(tr)("td:eq(0)").text()
                a_list = doc(tr).find("td.preview>a")
                for a in a_list:
                    apartment_detail = {
                        'COMMUNITY_ID': community_id,
                        'FLOOR_NUM': Floor_num,
                        'APARTMENT_NUM': doc(a).text(),
                        'STATUS': '2',
                        'create_time': time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                    }
                    self._save_apartment(apartment_detail)
            sql = "update communities set status = '2' where ORIGINAL_URL = '{}' ; ".format(url)
            Dao.execute_dmls(sql)
        except Exception as e:
            print(e)
            sql = "update communities set status = -1 where ORIGINAL_URL = '{}' ; ".format(url)
            Dao.execute_dmls(sql)

    def _insert_community(self):
        result = "INSERT INTO  communities (ORIGINAL_URL,NAME,AREA_NAME,LATITUDE,LONGITUDE,address,source_id )" \
                 " VALUES ('{}','{}','{}','{}','{}' ,'{}','{}' )".format(self._community_detail["url"],
                                                                         self._community_detail["name"],
                                                                         self._community_detail["area_name"],
                                                                         self._community_detail["latitude"],
                                                                         self._community_detail["longitude"],
                                                                         self._community_detail["location"],
                                                                         self.source_id)
        return result


    def _insert_apartment(self, apartment_detail):
        result = "INSERT INTO  apartments (COMMUNITY_ID  , APARTMENT_NUM ,STATUS ,FLOOR_NUM,create_time  )" \
                 " VALUES ('{}','{}','{}','{}','{}'  )".format(apartment_detail["COMMUNITY_ID"],
                                                               apartment_detail["APARTMENT_NUM"],
                                                               apartment_detail["STATUS"],
                                                               apartment_detail["FLOOR_NUM"],
                                                               apartment_detail["create_time"])
        return result


    def _save_apartment(self, apartment_detail):
        # check whether the record already exists
        query_sql = "SELECT * FROM apartments WHERE COMMUNITY_ID  = {}  and FLOOR_NUM ='{}'  and APARTMENT_NUM ='{}' ".format(
            int(apartment_detail["COMMUNITY_ID"]), apartment_detail["FLOOR_NUM"],
            apartment_detail["APARTMENT_NUM"])
        if Dao.execute_query(query_sql) is not None:
            print(" {} is already exists ,so next".format(str(apartment_detail["COMMUNITY_ID"]) +
                                                          apartment_detail["FLOOR_NUM"] +
                                                          apartment_detail["APARTMENT_NUM"]))
            return
        # insert the record
        try:
            Dao.execute_dmls(self._insert_apartment(apartment_detail))
        except Exception as e:
            print(e)


    def _save_community(self):
        # check whether the record already exists
        query_sql = "SELECT * FROM communities WHERE ORIGINAL_URL = '{}' and source_id ={} ".format(
            self._community_detail["url"], self.source_id)
        communityid_sql = "SELECT COMMUNITY_ID FROM communities WHERE ORIGINAL_URL = '{}' and source_id ={} ".format(
            self._community_detail["url"], self.source_id)
        if Dao.execute_query(query_sql) is not None:
            print(" {} is already exists ,so next".format(self._community_detail["name"]))
            return Dao.execute_query(communityid_sql)[0][0]
        # insert the record
        Dao.execute_dmls(self._insert_community())
        return Dao.execute_query(communityid_sql)[0][0]

    def _check_community(self, url):
        # check whether a record already exists with finished status
        communityid_sql = "SELECT COMMUNITY_ID FROM communities WHERE ORIGINAL_URL = '{}' and source_id ={} and status = 2 ".format(
            url, self.source_id)
        result = Dao.execute_query(communityid_sql)
        return result is not None


    def run(self):
        # for i in range(self.min_page, self.max_page):
        for i in range(363, 397):
            url = "http://newhouse.hfhome.cn/hffd_xkz.aspx?page={}".format(i)
            self._visit_pages(url)
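
Because this crawler is also a threading.Thread, the SELECT-then-INSERT check in _save_community is racy: two threads can both see no row and then both insert. Given a unique key on (ORIGINAL_URL, source_id), which is an assumption since the source never shows the schema, the check and insert collapse into one MySQL statement; dao, detail, and source_id below are stand-ins for the instance attributes:

# Sketch: single-statement dedup, assuming a unique index on
# (ORIGINAL_URL, source_id). INSERT IGNORE skips rows that would violate
# the index, so the prior SELECT (and its race) disappears.
def save_community(dao, detail, source_id):
    insert_sql = ("INSERT IGNORE INTO communities "
                  "(ORIGINAL_URL, NAME, AREA_NAME, LATITUDE, LONGITUDE, address, source_id) "
                  "VALUES ('{}','{}','{}','{}','{}','{}','{}')".format(
                      detail["url"], detail["name"], detail["area_name"],
                      detail["latitude"], detail["longitude"],
                      detail["location"], source_id))
    dao.execute_dmls(insert_sql)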
Example #9
 def __init__(self):
     self._conn = sqlite3.connect('database.db')
     self.vaccines = Dao(DTO.Vaccine, self._conn)
     self.suppliers = Dao(DTO.Supplier, self._conn)
     self.clinics = Dao(DTO.Clinic, self._conn)
     self.logistics = Dao(DTO.Logistic, self._conn)
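
Example #9 wires one Dao per DTO class over a shared sqlite3 connection. The DTO and Dao internals are not shown in the source, so the following is only a sketch of what such a generic table gateway might look like, inferred from the constructor calls above:

class Dao:
    # One Dao per DTO class, all sharing a connection. Assumes each DTO's
    # __init__ parameters match the table columns and that the table is
    # named after the class, lowercased with an 's' suffix (assumptions).
    def __init__(self, dto_type, conn):
        self._conn = conn
        self._dto_type = dto_type
        self._table = dto_type.__name__.lower() + 's'

    def insert(self, dto):
        attrs = vars(dto)  # column names taken from instance attributes
        cols = ', '.join(attrs)
        qmarks = ', '.join('?' for _ in attrs)
        self._conn.execute(
            'INSERT INTO {} ({}) VALUES ({})'.format(self._table, cols, qmarks),
            tuple(attrs.values()))

    def find_all(self):
        cur = self._conn.execute('SELECT * FROM {}'.format(self._table))
        return [self._dto_type(*row) for row in cur.fetchall()]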
Example #10
class CommunitiesListCrawler(BaseCrawler):
    global Dao
    Dao = Dao()

    def __init__(self):
        # TODO: use parameterization and multithreading for the crawl

        super().__init__()
        self.detail_info_urls = []
        self.source_id = 21
        self._base_url = "http://www.njhouse.com.cn/spf/inf/"
        self._root_url = "http://www.njhouse.com.cn/spf/inf/index.php"
        self._apartment_detail = {
            'COMMUNITY_ID': 0,
            'BUILDING_NUM': '',
            'APARTMENT_NUM': '',
            'STATUS': '2',
            'create_time': ''
        }

    def _visit_pages(self, seed_url):
        """
        visit one URL and get its page content
        """

        # for single_url in seed_url:
        # update_sql = "   UPDATE  fetch_list SET  times = times+1 WHERE url = '{}'and source_id =17".format(
        # single_url[0])
        # Dao.execute_dmls(update_sql)
        # self._base_url = single_url[0]
        # self._now_url = single_url[0]
        # html = self.get_page_content_str(single_url[0])
        #     try:
        #         self._extract_data(html)
        #     except Exception as e:
        #         print(e)
        #         update_sql = "   UPDATE  fetch_list SET  status  = 1 WHERE url = '{}'and source_id =17".format(
        #             single_url[0])
        #         Dao.execute_dmls(update_sql)

        # single URL
        # html = self.get_page_content_str(self._seed_url[0])  # when reading seeds from the database
        seed_url = self._root_url + seed_url[seed_url.rindex("?"):]
        html = self.get_page_content_str(seed_url)  # single URL
        self.findEachBuilding(html)
        # b = set(self._resualt)
        # self._resualt=[i for  i in b]
        # # dao=Dao()
        # insert_sql=""
        # for res1 in b :
        # insert_sql = "INSERT INTO merchant_tmp (description,url )VALUES ( '{}', 'http://www.youlin.me/category/407')".format(res1)
        # print( insert_sql  )
        # dao = Dao()
        # dao.execute_dmls(insert_sql)

    def get_page_content_str(self, url):

        try:
            print("现在开始抓取" + url)
            headers = {
                'User-Agent':
                'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'
            }
            request = urllib.request.Request(url=url, headers=headers)
            m_fp = urllib.request.urlopen(request, timeout=1500)
            html_str_uncode = m_fp.read()
            if not html_str_uncode:
                # read() returns bytes, so the old comparison to '' could
                # never match; retry on a genuinely empty response
                print("problem: empty response")
                return self.get_page_content_str(url)
            m_fp.close()
            return html_str_uncode
        except urllib.error.URLError as err:
            return None
        except Exception as err:
            print(err)
            return None

    def _generate_seed_url(self):
        """
        generate all URLs to visit
        """
        # self._seed_url = "http://www.njhouse.com.cn/spf/inf/index.php?prjid=108510"
        # self._visit_pages(self._seed_url)
        # from page 1 to anypage which < 200

        # # load from the database
        # self._seed_url = Dao._get_url_by_id(self.source_id)

        querysql = "SELECT COMMUNITY_ID,ORIGINAL_URL FROM communities WHERE   source_id ='{}' and status= -1 ; ".format(
            self.source_id)
        result = Dao.execute_query(querysql)
        for COMMUNITY_ID, ORIGINAL_URL in result:
            try:
                self._apartment_detail["COMMUNITY_ID"] = int(COMMUNITY_ID)
                self._apartment_detail["create_time"] = time.strftime(
                    '%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                self._visit_pages(ORIGINAL_URL)
                sql = "update communities set status = '2' where COMMUNITY_ID = '{}' ".format(
                    int(COMMUNITY_ID))
                Dao.execute_dmls(sql)
            except Exception as e:
                print(e)
                sql = "update communities set status = '-1' where COMMUNITY_ID = '{}' ".format(
                    int(COMMUNITY_ID))
                Dao.execute_dmls(sql)

                # add directly, for testing
                # self._seed_url.append(self._base_url)

    def findEachBuilding(self, html):
        doc = Pq(html)
        a_list = doc("table>tr>td.text>a")
        for a in a_list:
            self._apartment_detail["BUILDING_NUM"] = doc(a).text()
            url = self._base_url + doc(a).attr("href")
            doc_str = self.get_page_content_str(url)
            self._extract_data(doc_str)
            time.sleep(5)

    def _extract_data(self, doc_str):
        try:
            doc = Pq(doc_str)
            a_list = doc("tr.text>td>a")
            # total_item =int( doc("").text().strip())
            # count_num = int(total_item) / 12
            for a in a_list:
                self._apartment_detail["APARTMENT_NUM"] = doc(a).text()
                if self._apartment_detail["APARTMENT_NUM"].strip() != '':
                    self._apartment_detail["create_time"] = time.strftime(
                        '%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                    self._save_community()
        except Exception as err:
            print(err)
            time.sleep(100)
            self._extract_data(doc_str)

    def _insert_community(self):
        result = "INSERT INTO  apartments (COMMUNITY_ID , BUILDING_NUM , APARTMENT_NUM ,STATUS ,create_time  )" \
                 " VALUES ('{}','{}','{}','{}','{}'  )".format(self._apartment_detail["COMMUNITY_ID"],
                                                               self._apartment_detail["BUILDING_NUM"],
                                                               self._apartment_detail["APARTMENT_NUM"],
                                                               self._apartment_detail["STATUS"],
                                                               self._apartment_detail["create_time"])
        return result

    def _save_community(self):
        # check whether the record already exists
        query_sql = "SELECT * FROM apartments WHERE COMMUNITY_ID  = {}  and BUILDING_NUM ='{}'  and APARTMENT_NUM ='{}' ".format(
            int(self._apartment_detail["COMMUNITY_ID"]),
            self._apartment_detail["BUILDING_NUM"],
            self._apartment_detail["APARTMENT_NUM"])
        if Dao.execute_query(query_sql) is not None:
            print(" {} is already exists ,so next".format(
                str(self._apartment_detail["COMMUNITY_ID"]) +
                self._apartment_detail["BUILDING_NUM"] +
                self._apartment_detail["APARTMENT_NUM"]))
            return
        # insert the record
        try:
            Dao.execute_dmls(self._insert_community())
        except Exception as e:
            print(e)

    def craw(self):
        self._generate_seed_url()
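
get_page_content_str above retries recursively whenever the response is empty, with no bound, so a server that keeps answering with nothing recurses until the stack overflows. A bounded-retry sketch; the header value is copied from the examples, and the 10-second timeout is an assumption:

import urllib.error
import urllib.request

def fetch(url, retries=3, timeout=10):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) '
                             'Gecko/20100101 Firefox/23.0'}
    for attempt in range(retries):
        try:
            request = urllib.request.Request(url=url, headers=headers)
            with urllib.request.urlopen(request, timeout=timeout) as m_fp:
                body = m_fp.read()
            if body:
                return body
            print("empty response from {}, retry {}".format(url, attempt + 1))
        except urllib.error.URLError as err:
            print(err)
    return None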
class CommunitiesListCrawler(BaseCrawler):
    global Dao
    Dao = Dao()

    def craw(self):
        self._generate_seed_url()

    def __init__(self):
        # TODO: use parameterization and multithreading for the crawl

        super().__init__()
        self.detail_info_urls = []
        self.source_id = 18
        self._base_url = "http://bj.5i5j.com"
        self._community_detail = {
            'url': '',
            'name': '',
            'location': '',
            'area_name': '',
            'description': '',
            'latitude': '',
            'longitude': '',
            'city': ''
        }

    def _visit_pages(self, seed_url):
        """
        visit one URL and get its page content
        """

        # for single_url in seed_url:
        # update_sql = "   UPDATE  fetch_list SET  times = times+1 WHERE url = '{}'and source_id =17".format(
        # single_url[0])
        # Dao.execute_dmls(update_sql)
        # self._base_url = single_url[0]
        # self._now_url = single_url[0]
        #     html = self.get_page_content_str(single_url[0])
        #     try:
        #         self._extract_data(html)
        #     except Exception as e:
        #         print(e)
        #         update_sql = "   UPDATE  fetch_list SET  status  = 1 WHERE url = '{}'and source_id =17".format(
        #             single_url[0])
        #         Dao.execute_dmls(update_sql)

        # single URL
        html = self.get_page_content_str(seed_url)
        self.findEachArea(html)
        # b = set(self._resualt)
        # self._resualt=[i for  i in b]
        # # dao=Dao()
        # insert_sql=""
        # for res1 in b :
        # insert_sql = "INSERT INTO merchant_tmp (description,url )VALUES ( '{}', 'http://www.youlin.me/category/407')".format(res1)
        # print( insert_sql  )
        # dao = Dao()
        # dao.execute_dmls(insert_sql)

    def _generate_seed_url(self):
        """
        generate all URLs to visit
        """

        # from page 1 to anypage which < 200

        # # load from the database
        # self._seed_url = Dao._get_url_by_id(self.source_id)

        # add directly, for testing
        self._visit_pages("http://www.llzg.cn/nhd/neig44000000.html")

    def findEachArea(self, HTML):
        doc = Pq(HTML)
        citys = doc(
            "body > div > div.container > div > div > div.branchNavWrap")
        for city in citys:
            self._community_detail['city'] = doc(city).find(
                "div.title3>h4").text()
            areas = doc(city).find("div.branchNav>a")
            for area in areas:
                self._community_detail['area_name'] = doc(area).text()
                url = doc(area).attr('href') + '/plot/plotlist'
                self._extract_data(url)

    def _extract_data(self, url):
        doc_str = self.get_page_content_str(url)

        doc = Pq(doc_str)
        communities = doc(
            "body > div.container.marginTop10px > div > div.span9 > div.areasWrap > div.areas > dl.clearfix > dd > a"
        )
        for community in communities:
            self._community_detail['name'] = doc(community).text()
            self._save_community()

    def _insert_community(self):
        result = "INSERT INTO  ehdc.communities_llzg (ORIGINAL_URL,NAME,AREA_NAME,LATITUDE,LONGITUDE,city_name )" \
                 " VALUES ('{}','{}','{}','{}','{}' ,'{}' )".format(self._community_detail["url"],
                                                                    self._community_detail["name"],
                                                                    self._community_detail["area_name"],
                                                                    '',
                                                                    '',
                                                                    self._community_detail['city'])
        return result

    def _save_community(self):
        # check whether the record already exists
        query_sql = "SELECT * FROM ehdc.communities_llzg WHERE NAME = '{}' and AREA_NAME='{}' ".format(
            self._community_detail["name"],
            self._community_detail["area_name"])
        if Dao.execute_query(query_sql) is not None:
            print(" {} is already exists ,so next".format(
                self._community_detail["name"]))
            return
        # insert the record
        Dao.execute_dmls(self._insert_community())
Example #12
class CommunitiesListCrawler(BaseCrawler):
    global Dao
    Dao = Dao()

    def __init__(self):
        # TODO: use parameterization and multithreading for the crawl

        super().__init__()
        self.detail_info_urls = []
        self.source_id = 21
        self._base_url = "http://newhouse.cnnbfdc.com/lpxx.aspx"
        self._community_detail = {
            'url': '',
            'name': '',
            'location': '',
            'area_name': '',
            'description': '',
            'latitude': '',
            'longitude': ''
        }

    def _visit_pages(self, seed_url):
        """
        visit one URL and get its page content
        """
        # single URL
        html = self.get_page_content_str(seed_url)
        self._extract_data(html)

    # def get_page_content_str(self, url):
    # time.sleep(1)
    #
    # try:
    # print("现在开始抓取" + url)
    # headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
    #         request = urllib.request.Request(url=url, headers=headers)
    #         m_fp = urllib.request.urlopen(request, timeout=5500)
    #         html_str_uncode = m_fp.read()
    #         m_fp.close()
    #         return html_str_uncode
    #     except urllib.error.URLError as err:
    #         # logfile = open('test.log', 'a')
    #         # logfile.write("Error: {} \n in  url : {}".format(err, url))
    #         # logfile.close()
    #         # print("error in {}.get_page_content_str".format(__name__))
    #         # if url[-3:] == "htm":
    #         # time.sleep(120)
    #         # return self.get_page_content_str(url)
    #         return None
    #     except Exception  as err:
    #         print(err)
    #         return None

    def _generate_seed_url(self):
        """
        generate all URLs to visit
        """

        # from page 1 to anypage which < 200

        # # load from the database
        # self._seed_url = Dao._get_url_by_id(self.source_id)

        # add directly, for testing
        # self._visit_pages(self._base_url)
        # actual run: loop over every page
        for i in range(1, 70):
            visiturl = self._base_url + '?p=' + str(i)
            self._visit_pages(visiturl)

    def _extract_data(self, doc_str):
        doc = Pq(doc_str)
        tr_list = doc("td.sp_sck>table>tr")
        # total_item =int( doc("").text().strip())
        # count_num = int(total_item) / 12
        for tr in tr_list:
            try:
                doc = Pq(tr)
                # test =  doc(doc("tr")[1]).find("td")[1].text()
                self._community_detail['location'] = doc("td:eq(3)").text()
                self._community_detail['name'] = doc("a.sp_zi12c").text()
                self._community_detail['url'] = doc("a.sp_zi12c").attr("href")
                self._community_detail['area_name'] = doc("span.sp_f12").text()
                if self._community_detail['name'] != '':
                    self._save_community()
            except Exception as err:
                print(tr)
                print(err)
                continue

    def _insert_community(self):
        result = "INSERT INTO  communities (ORIGINAL_URL,NAME,AREA_NAME,LATITUDE,LONGITUDE,address,source_id )" \
                 " VALUES ('{}','{}','{}','{}','{}' ,'{}','{}' )".format(self._community_detail["url"],
                                                                         self._community_detail["name"],
                                                                         self._community_detail["area_name"],
                                                                         self._community_detail["latitude"],
                                                                         self._community_detail["longitude"],
                                                                         self._community_detail["location"],
                                                                         self.source_id)
        return result

    def _save_community(self):
        # check whether the record already exists
        query_sql = "SELECT * FROM communities WHERE ORIGINAL_URL = '{}' and source_id ={} ".format(
            self._community_detail["url"], self.source_id)
        if Dao.execute_query(query_sql) is not None:
            print(" {} is already exists ,so next".format(
                self._community_detail["name"]))
            return
        # insert the record
        Dao.execute_dmls(self._insert_community())

    def craw(self):
        self._generate_seed_url()
class CommunitiesListCrawler(BaseCrawler):
    global Dao
    Dao = Dao()

    def __init__(self):
        # TODO: use parameterization and multithreading for the crawl

        super().__init__()
        self.detail_info_urls = []
        self.source_id = 18
        self._base_url = "http://bj.5i5j.com"
        self._community_detail = {
            'url': '',
            'name': '',
            'location': '',
            'area_name': '',
            'description': '',
            'latitude': '',
            'longitude': ''
        }

    def _visit_pages(self, seed_url):
        """
        visit one URL and get its page content
        """

        # for single_url in seed_url:
        #     update_sql = "   UPDATE  fetch_list SET  times = times+1 WHERE url = '{}'and source_id =17".format(
        #         single_url[0])
        #     Dao.execute_dmls(update_sql)
        #     self._base_url = single_url[0]
        #     self._now_url = single_url[0]
        #     html = self.get_page_content_str(single_url[0])
        #     try:
        #         self._extract_data(html)
        #     except Exception as e:
        #         print(e)
        #         update_sql = "   UPDATE  fetch_list SET  status  = 1 WHERE url = '{}'and source_id =17".format(
        #             single_url[0])
        #         Dao.execute_dmls(update_sql)

        # single URL
        html = self.get_page_content_str(self._seed_url[0])
        self.findEachArea(html)
        # b = set(self._resualt)
        # self._resualt=[i for  i in b]
        # # dao=Dao()
        # insert_sql=""
        # for res1 in b :
        # insert_sql = "INSERT INTO merchant_tmp (description,url )VALUES ( '{}', 'http://www.youlin.me/category/407')".format(res1)
        # print( insert_sql  )
        # dao = Dao()
        # dao.execute_dmls(insert_sql)

    def _generate_seed_url(self):
        """
        generate all URLs to visit
        """

        # from page 1 to anypage which < 200

        # # load from the database
        # self._seed_url = Dao._get_url_by_id(self.source_id)

        # add directly, for testing
        self._seed_url.append("http://bj.5i5j.com/community/")



    def findEachArea(self, HTML):
        doc = Pq(HTML)
        li = doc(".c-info>.s-term2>li")[0]
        aList = doc(li).find("a")
        hrefList = []
        for a in aList:
            a = doc(a).attr("href")
            if a not in ("/community/", "/community/chaoyang/",
                         "/community/haidian/", "/community/fengtai/",
                         "/community/dongcheng/"):
                hrefList.append(self._base_url + a)
        for href in hrefList:
            html = self.get_page_content_str(href)
            doc1 = Pq(html)
            aList1 = doc1(".c-info>.s-term2>li>.pop1>ul>.text2>a")
            for a in aList1:
                a = doc(a).attr("href")
                self._extract_data(self._base_url + a)

    def _extract_data(self, url):
        doc_str = self.get_page_content_str(url)
        # page text means "sorry, no communities matched your criteria"
        if "抱歉,没有找到符合您要求的小区" in doc_str:
            return False
        doc = Pq(doc_str)
        total_item = int(doc(".main-l>.reorder>b>font").text().strip())
        count_num = int(total_item) / 12
        for page in range(1, int(count_num) + 2):
            url1 = url + "n" + str(page)
            html = self.get_page_content_str(url1)
            self._extract_data2(html)

    def _extract_data2(self, doc_str):
        doc = Pq(doc_str)
        li_list = doc(".main-l>ul[class^='house-list']>li")
        for li in li_list:
            self._community_detail['latitude'] = doc(li).attr("y")
            self._community_detail['longitude'] = doc(li).attr("x")
            self._community_detail["url"] = self._base_url + doc(li).find("dl>dd>.xqbt>a").attr("href")
            self._community_detail["name"] = doc(li).find("dl>dd>.xqbt>a").text()
            self._community_detail["area_name"] = "".join(doc(li).find("dl>dd>.pw1015>span[class^='add']>a").text())
            print(self._community_detail)
            # url = doc(li).find(".details>.p_links>a").attr("href")
            self._save_community()


    def _insert_community(self):
        result = "INSERT INTO  communities (ORIGINAL_URL,NAME,AREA_NAME,LATITUDE,LONGITUDE,source_id )" \
                 " VALUES ('{}','{}','{}','{}','{}' ,'{}' )".format(self._community_detail["url"],
                                                                         self._community_detail["name"],
                                                                         self._community_detail["area_name"],
                                                                         self._community_detail["latitude"],
                                                                         self._community_detail["longitude"],
                                                                         self.source_id)
        return result

    def _save_community(self):
        # check whether the record already exists
        query_sql = "SELECT * FROM communities WHERE ORIGINAL_URL = '{}' and source_id = 18 ".format(self._community_detail["url"])
        if Dao.execute_query(query_sql) is not None:
            print(" {} is already exists ,so next".format(self._community_detail["name"]))
            return
        # insert the record
        Dao.execute_dmls(self._insert_community())
class CommunitiesListCrawler(BaseCrawler):
    global Dao
    Dao = Dao()

    def __init__(self):
        # TODO: use parameterization and multithreading for the crawl

        super().__init__()
        self.detail_info_urls = []
        self.source_id = 21
        self._base_url = "http://www.njhouse.com.cn/persalereg.php"
        self._community_detail = {
            'url': '',
            'name': '',
            'location': '',
            'area_name': '',
            'description': '',
            'latitude': '',
            'longitude': ''
        }

    def _visit_pages(self, seed_url):
        """
        visit one URL and get its page content
        """

        # for single_url in seed_url:
        #     update_sql = "   UPDATE  fetch_list SET  times = times+1 WHERE url = '{}'and source_id =17".format(
        #         single_url[0])
        #     Dao.execute_dmls(update_sql)
        #     self._base_url = single_url[0]
        #     self._now_url = single_url[0]
        #     html = self.get_page_content_str(single_url[0])
        #     try:
        #         self._extract_data(html)
        #     except Exception as e:
        #         print(e)
        #         update_sql = "   UPDATE  fetch_list SET  status  = 1 WHERE url = '{}'and source_id =17".format(
        #             single_url[0])
        #         Dao.execute_dmls(update_sql)

        # single URL
        html = self.get_page_content_str(self._seed_url[0])
        self._extract_data(html)
        # b = set(self._resualt)
        # self._resualt=[i for  i in b]
        # # dao=Dao()
        # insert_sql=""
        # for res1 in b :
        # insert_sql = "INSERT INTO merchant_tmp (description,url )VALUES ( '{}', 'http://www.youlin.me/category/407')".format(res1)
        # print( insert_sql  )
        # dao = Dao()
        # dao.execute_dmls(insert_sql)

    def get_page_content_str(self, url):
        time.sleep(1)

        try:
            print("现在开始抓取" + url)
            headers = {
                'User-Agent':
                'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'
            }
            request = urllib.request.Request(url=url, headers=headers)
            m_fp = urllib.request.urlopen(request, timeout=5500)
            html_str_uncode = m_fp.read()
            m_fp.close()
            return html_str_uncode
        except urllib.error.URLError as err:
            # logfile = open('test.log', 'a')
            # logfile.write("Error: {} \n in  url : {}".format(err, url))
            # logfile.close()
            # print("error in {}.get_page_content_str".format(__name__))
            # if url[-3:] == "htm":
            # time.sleep(120)
            #     return self.get_page_content_str(url)
            return None
        except Exception as err:
            print(err)
            return None

    def _generate_seed_url(self):
        """
        generate all URLs to visit
        """

        # from page 1 to anypage which < 200

        # # load from the database
        # self._seed_url = Dao._get_url_by_id(self.source_id)

        # add directly, for testing
        self._seed_url.append(self._base_url)

    def _extract_data(self, doc_str):
        doc = Pq(doc_str)
        tables = doc("table>tr>td>table")
        # total_item =int( doc("").text().strip())
        # count_num = int(total_item) / 12
        for table in tables:
            try:
                doc = Pq(table)
                # test =  doc(doc("tr")[1]).find("td")[1].text()
                self._community_detail['location'] = Pq(
                    doc("tr:eq(1)"))("td:eq(1)").text()
                self._community_detail['name'] = Pq(
                    doc("tr:eq(2)"))("a").text()
                self._community_detail['url'] = Pq(
                    doc("tr:eq(2)"))("a").attr("href")
                self._community_detail['area_name'] = Pq(
                    doc("tr:eq(8)"))("td:eq(1)").text()

                self._save_community()
            except Exception as err:
                print(table)
                print(err)
                continue

    def _insert_community(self):
        result = "INSERT INTO  communities (ORIGINAL_URL,NAME,AREA_NAME,LATITUDE,LONGITUDE,location,source_id )" \
                 " VALUES ('{}','{}','{}','{}','{}' ,'{}','{}' )".format(self._community_detail["url"],
                                                                    self._community_detail["name"],
                                                                    self._community_detail["area_name"],
                                                                    self._community_detail["latitude"],
                                                                    self._community_detail["longitude"],
                                                                    self._community_detail["location"],
                                                                    self.source_id)
        return result

    def _save_community(self):
        # check whether the record already exists
        query_sql = "SELECT * FROM communities WHERE ORIGINAL_URL = '{}' and source_id ={} ".format(
            self._community_detail["url"], self.source_id)
        if Dao.execute_query(query_sql) is not None:
            print(" {} is already exists ,so next".format(
                self._community_detail["name"]))
            return
        # insert the record
        Dao.execute_dmls(self._insert_community())
class CommunitiesListCrawler(BaseCrawler):
    global Dao
    Dao = Dao()

    def __init__(self):
        # TODO: use parameterization and multithreading for the crawl

        super().__init__()
        self.detail_info_urls = []
        self.source_id = 17
        self._base_url = "http://sz.ganji.com/"
        self._community_detail = {
            'url': '',
            'name': '',
            'location': '',
            'area_name': '',
            'description': '',
            'latitude': '',
            'longitude': ''
        }

    def _visit_pages(self, seed_url):
        """
        visit one URL and get its page content
        """

        for single_url in seed_url:
            update_sql = "   UPDATE  fetch_list SET  times = times+1 WHERE url = '{}'and source_id =17".format(
                single_url[0])
            Dao.execute_dmls(update_sql)
            self._base_url = single_url[0]
            self._now_url = single_url[0]
            html = self.get_page_content_str(single_url[0])
            try:
                self._extract_data(html)
            except Exception as e:
                print(e)
                update_sql = "   UPDATE  fetch_list SET  status  = 1 WHERE url = '{}'and source_id =17".format(
                    single_url[0])
                Dao.execute_dmls(update_sql)

                # single URL
                # html = self.get_page_content_str(self._seed_url[0])
                # self._extract_data(html)
                # b = set(self._resualt)
                # self._resualt=[i for  i in b]
                # # dao=Dao()
                # insert_sql=""
                # for res1 in b :
                # insert_sql = "INSERT INTO merchant_tmp (description,url )VALUES ( '{}', 'http://www.youlin.me/category/407')".format(res1)
                # print( insert_sql  )
                # dao = Dao()
                # dao.execute_dmls(insert_sql)

    def _generate_seed_url(self):
        """
        generate all URLs to visit
        """

        # from page 1 to anypage which < 200

        # load from the database
        self._seed_url = Dao._get_url_by_id(self.source_id)

        # add directly, for testing
        # self._seed_url.append("http://www.anjuke.com/index.html")

    def _extract_data(self, doc_str):
        doc = Pq(doc_str)
        total_item = doc(".PL_1.clearfix>span>em").text()
        count_num = int(total_item) / 10
        for page in range(1, int(count_num) + 1):
            url = self._base_url + "W0QQpZ" + str(page)
            html = self.get_page_content_str(url)
            self._extract_data2(html)

    def _extract_data2(self, doc_str):
        doc = Pq(doc_str)
        li_list = doc(".mainListing.clearfix>.pL>.list>li")
        for li in li_list:
            self._community_detail["url"] = doc(li).find(
                ".details>div>a").attr("href")
            self._community_detail["name"] = doc(li).find(
                ".details>div>a").text()
            p = doc(li).find(".details>p")
            self._community_detail["location"] = doc(p[0]).text()
            self._community_detail["area_name"] = self._community_detail[
                "location"][self._community_detail["location"].index("[") +
                            1:self._community_detail["location"].index("]")]
            self._community_detail["location"] = self._community_detail[
                "location"][self._community_detail["location"].index("]") + 1:]

            url = doc(li).find(".details>.p_links>a").attr("href")
            self._community_detail['latitude'] = url[url.index("l1=") +
                                                     3:url.index("&l2")]
            self._community_detail['longitude'] = url[url.index("l2=") +
                                                      3:url.index("&l3")]
            self._save_community()

    def _insert_community(self):
        result = "INSERT INTO  communities (ORIGINAL_URL,NAME,address,AREA_NAME,LATITUDE,LONGITUDE,source_id )" \
                 " VALUES ('{}','{}','{}','{}','{}','{}' ,'{}' )".format(self._community_detail["url"],
                                                                         self._community_detail["name"],
                                                                         self._community_detail["location"],
                                                                         self._community_detail["area_name"],
                                                                         self._community_detail["latitude"],
                                                                         self._community_detail["longitude"],
                                                                         self.source_id)
        return result

    def _save_community(self):
        # check whether the record already exists
        query_sql = "SELECT * FROM communities WHERE ORIGINAL_URL = '{}'".format(
            self._community_detail["url"])
        if Dao.execute_query(query_sql) is not None:
            print(" {} is already exists ,so next".format(
                self._community_detail["name"]))
            return
        # insert the record
        Dao.execute_dmls(self._insert_community())
class CommunitiesListCrawler(BaseCrawler):
    global Dao
    Dao = Dao()

    def __init__(self):
        # TODO: use parameterization and multithreading for the crawl

        super().__init__()
        self.detail_info_urls = []
        self.source_id = 28
        self._base_url = "http://newhouse.cnnbfdc.com/GetHouseTable.aspx"
        self._apartment_detail = {
            'COMMUNITY_ID': 0,
            'BUILDING_NUM': '',
            'APARTMENT_NUM': '',
            'STATUS': '2',
            'create_time': ''
        }

    def _visit_pages(self, seed_url, apartment_detail):
        """
        visit one URL and get its page content
        """

        # single URL
        # html = self.get_page_content_str(self._seed_url[0])  # when reading seeds from the database
        endurl = seed_url[seed_url.index("?"):seed_url.index("&projectid")]
        seed_url = self._base_url + endurl
        html = self.get_page_content_str(seed_url)  # single URL
        self._extract_data(html, apartment_detail)
        # b = set(self._resualt)
        # self._resualt=[i for  i in b]
        # # dao=Dao()
        # insert_sql=""
        # for res1 in b :
        # insert_sql = "INSERT INTO merchant_tmp (description,url )VALUES ( '{}', 'http://www.youlin.me/category/407')".format(res1)
        # print( insert_sql  )
        # dao = Dao()
        # dao.execute_dmls(insert_sql)

    def _generate_seed_url(self):
        """
        generate all URLs to visit
        """
        # self._seed_url = "http://www.njhouse.com.cn/spf/inf/index.php?prjid=108510"
        # self._visit_pages(self._seed_url)
        # from page 1 to anypage which < 200

        # # load from the database
        # self._seed_url = Dao._get_url_by_id(self.source_id)

        querysql = "SELECT  COMMUNITY_ID, BUILDING_NUM, URL  FROM ehdc.buildings WHERE STATUS = 0  ; "
        result = Dao.execute_query(querysql)
        for COMMUNITY_ID, BUILDING_NUM, URL in result:
            self.execute(COMMUNITY_ID, BUILDING_NUM, URL)

    def execute(self, COMMUNITY_ID, BUILDING_NUM, URL):
        try:
            apartment_detail = {
                'COMMUNITY_ID': 0,
                'BUILDING_NUM': '',
                'APARTMENT_NUM': '',
                'STATUS': '2',
                'create_time': ''
            }
            apartment_detail["COMMUNITY_ID"] = int(COMMUNITY_ID)
            apartment_detail["BUILDING_NUM"] = BUILDING_NUM
            apartment_detail["create_time"] = time.strftime(
                '%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            self._visit_pages(URL, apartment_detail)
            sql = "update BUILDINGS set status = '2' where URL = '{}' ; ".format(
                URL)
            Dao.execute_dmls(sql)
        except Exception as e:
            print(e)
            sql = "update BUILDINGS set status = -1 where URL = '{}' ; ".format(
                URL)
            Dao.execute_dmls(sql)

    def findEachBuilding(self, html):
        doc = Pq(html)
        a_list = doc("a.e_huangse")
        for a in a_list:
            self._apartment_detail["BUILDING_NUM"] = doc(a).text()
            href = doc(a).attr("onclick")
            href = href[href.index("'") + 1:]
            href = href[:href.index("'")]
            url = self._base_url + href
            # doc_str = self.get_page_content_str(url)
            # self._extract_data(doc_str)
            # time.sleep(1)
            self.save_building(url)

    def _extract_data(self, doc_str, apartment_detail):
        try:
            doc = Pq(doc_str)
            a_list = doc("table>tr>td>table>tr>td")
            # total_item =int( doc("").text().strip())
            # count_num = int(total_item) / 12
            for a in a_list:
                apartment_detail["APARTMENT_NUM"] = doc(a).text()
                if apartment_detail["APARTMENT_NUM"].strip() != '':
                    apartment_detail["create_time"] = time.strftime(
                        '%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                    self._save_community(apartment_detail)
        except Exception as err:
            print(err)
            time.sleep(1)
            # the retry must pass apartment_detail through, otherwise the
            # recursive call raises TypeError
            self._extract_data(doc_str, apartment_detail)

    def _insert_apartment(self, apartment_detail):
        result = "INSERT INTO  apartments (COMMUNITY_ID , BUILDING_NUM , APARTMENT_NUM ,STATUS ,create_time  )" \
                 " VALUES ('{}','{}','{}','{}','{}'  )".format(apartment_detail["COMMUNITY_ID"],
                                                               apartment_detail["BUILDING_NUM"],
                                                               apartment_detail["APARTMENT_NUM"],
                                                               apartment_detail["STATUS"],
                                                               apartment_detail["create_time"])
        return result

    def _save_community(self, apartment_detail):
        # check whether the record already exists
        query_sql = "SELECT * FROM apartments WHERE COMMUNITY_ID  = {}  and BUILDING_NUM ='{}'  and APARTMENT_NUM ='{}' ".format(
            int(apartment_detail["COMMUNITY_ID"]),
            apartment_detail["BUILDING_NUM"],
            apartment_detail["APARTMENT_NUM"])
        if Dao.execute_query(query_sql) is not None:
            print(" {} is already exists ,so next".format(
                str(apartment_detail["COMMUNITY_ID"]) +
                apartment_detail["BUILDING_NUM"] +
                apartment_detail["APARTMENT_NUM"]))
            return
        # insert the record
        try:
            Dao.execute_dmls(self._insert_apartment(apartment_detail))
        except Exception as e:
            print(e)

    def craw(self):
        self._generate_seed_url()

    def save_building(self, url):
        SQL = " INSERT INTO ehdc.buildings (COMMUNITY_ID,BUILDING_NUM,URL,STATUS) VALUES ('{}','{}','{}','{}') ;".format(
            str(self._apartment_detail["COMMUNITY_ID"]),
            self._apartment_detail["BUILDING_NUM"], url,
            self._apartment_detail["STATUS"])
        Dao.execute_dmls(SQL)
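
findEachBuilding digs the target URL out of an onclick attribute with two index() slices, which raise ValueError when the attribute holds no quoted argument. An equivalent regex sketch that returns None instead:

import re

def onclick_href(onclick):
    # Pull the first single-quoted argument out of an onclick value,
    # e.g. "showTable('GetHouseTable.aspx?id=1')" -> the quoted URL part.
    m = re.search(r"'([^']*)'", onclick or "")
    return m.group(1) if m else None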
Example #17
class CategoryListCrawler(BaseCrawler):
    global Dao
    Dao = Dao()

    def __init__(self):
        super().__init__()
        self.detail_info_urls = []
        self.source_id = 17
        self._base_url = "http://www.dianping.com/wuhan"
        self._category_detail = {'shopType': '', 'categoryId': '', 'name': ''}
        self._category_list = []

    def get_page_content_str(self, url):
        print("现在开始抓取" + url)
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'
        }
        request = urllib.request.Request(url=url, headers=headers)
        m_fp = urllib.request.urlopen(request, timeout=500)
        html_str = m_fp.read().decode('utf-8')
        m_fp.close()
        return html_str

    def _visit_pages(self, seed_url):
        """
        visit one URL and get its page content
        """
        # single URL
        html = self.get_page_content_str(self._seed_url[0])
        self._extract_data(html)

    def _generate_seed_url(self):
        """
        generate all URLs to visit
        """
        # add directly, for testing
        self._seed_url.append(self._base_url)

    def _extract_data(self, doc_str):
        doc = Pq(doc_str)
        li_list = doc('.aside.aside-left>.category-nav.J-category-nav>li')
        for li in li_list:
            self._category_detail["shopType"] = doc(li).attr("data-key")
            self._category_detail["categoryId"] = self._category_detail[
                "shopType"]
            self._category_detail["name"] = doc(li).find(".name>span").text()
            self._category_list.append(copy.copy(self._category_detail))
            # doc2   = Pq(doc_str)
            # div_list = doc2(".aside.aside-left>.category-nav.J-category-nav>li>.secondary-category.J-secondary-category>div>div")
            a_list = doc(li).find("div>a")
            for a in a_list:
                self._category_detail["categoryId"] = doc(a).attr("data-key")
                self._category_detail["name"] = doc(a).text()
                self._category_list.append(copy.copy(self._category_detail))

        self.save_category()

    def craw(self):
        self._generate_seed_url()
        self._visit_pages(self._seed_url)
        print(self._seed_url)

    def save_category(self):
        for category in self._category_list:
            sql = "INSERT INTO category (shopType,categoryId,name )  VALUES ('{}','{}','{}' )".format(
                category["shopType"], category["categoryId"], category["name"])
            Dao.execute_dmls(sql)
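
save_category issues one INSERT per category. Most DB-API drivers batch this with executemany; a sketch assuming direct access to the underlying cursor (the Dao wrapper's internals are not shown, so the cursor parameter here is hypothetical, and the '%s' marker applies to MySQL-style drivers):

def save_category(categories, cursor):
    # One round trip instead of len(categories); the driver escapes values.
    cursor.executemany(
        "INSERT INTO category (shopType, categoryId, name) VALUES (%s, %s, %s)",
        [(c["shopType"], c["categoryId"], c["name"]) for c in categories])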
Example #18
class CommunitiesListCrawler(BaseCrawler, threading.Thread):
    global Dao
    Dao = Dao()

    def __init__(self):

        super().__init__()
        self.detail_info_urls = []
        self.source_id = 31
        self._base_url = "http://www.tywsfdc.com/"
        self._root_url = "http://www.tywsfdc.com/Firsthand/tyfc/publish/p/ProNBList.do?pid"
        self._apartment_detail = {
            'COMMUNITY_ID': 0,
            'BUILDING_NUM': '',
            'APARTMENT_NUM': '',
            'STATUS': '2',
            'create_time': ''
        }

    def _visit_pages(self, seed_url):
        """
        visit one URL and get its page content
        """

        # single URL
        # html = self.get_page_content_str(self._seed_url[0])  # when reading seeds from the database
        self._pid = seed_url[seed_url.rindex("-"):]
        seed_url = self._root_url + "=" + self._pid
        # print("_visit_pages " + seed_url)
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0',
            'Referer': seed_url
        }
        values = {'pid': self._pid, 'pageNo': '1', 'pageSize': '50'}
        data = urllib.parse.urlencode(values).encode(encoding='UTF8')
        request = urllib.request.Request(
            url="http://www.tywsfdc.com/Firsthand/tyfc/publish/ProNBList.do",
            headers=headers,
            data=data)

        m_fp = urllib.request.urlopen(request, timeout=500)
        html_str = m_fp.read().decode("utf8")
        self.findEachBuilding(html_str)
        # b = set(self._resualt)
        # self._resualt=[i for  i in b]
        # # dao=Dao()
        # insert_sql=""
        # for res1 in b :
        # insert_sql = "INSERT INTO merchant_tmp (description,url )VALUES ( '{}', 'http://www.youlin.me/category/407')".format(res1)
        # print( insert_sql  )
        # dao = Dao()
        # dao.execute_dmls(insert_sql)

    def get_page_content_str(self, url):

        try:
            print("现在开始抓取" + url)
            headers = {
                'User-Agent':
                'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'
            }
            request = urllib.request.Request(url=url, headers=headers)
            m_fp = urllib.request.urlopen(request, timeout=1500)
            html_str_uncode = m_fp.read()
            if not html_str_uncode:
                # read() returns bytes, so the old comparison to '' could
                # never match; retry on a genuinely empty response
                print("problem: empty response")
                return self.get_page_content_str(url)
            m_fp.close()
            return html_str_uncode
        except urllib.error.URLError as err:
            return None
        except Exception as err:
            print(err)
            return None

    def _generate_seed_url(self):
        """
        generate all URLs to visit
        """
        # self._seed_url = "http://www.njhouse.com.cn/spf/inf/index.php?prjid=108510"
        # self._visit_pages(self._seed_url)
        # from page 1 to anypage which < 200

        # # load from the database
        # self._seed_url = Dao._get_url_by_id(self.source_id)

        querysql = "SELECT COMMUNITY_ID,ORIGINAL_URL FROM communities WHERE   source_id ='{}' and status<2 ; ".format(
            self.source_id)
        result = Dao.execute_query(querysql)
        for COMMUNITY_ID, ORIGINAL_URL in result:
            try:
                self._apartment_detail["COMMUNITY_ID"] = int(COMMUNITY_ID)
                self._apartment_detail["create_time"] = time.strftime(
                    '%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                # print("_generate_seed_url func : "+ORIGINAL_URL)
                self._visit_pages(ORIGINAL_URL)
                sql = "update communities set status = '2' where COMMUNITY_ID = '{}' ".format(
                    int(COMMUNITY_ID))
                Dao.execute_dmls(sql)
            except Exception as e:
                print(e)
                sql = "update communities set status = '-1' where COMMUNITY_ID = '{}' ".format(
                    int(COMMUNITY_ID))
                Dao.execute_dmls(sql)

                # add directly, for testing
                # self._seed_url.append(self._base_url)

    def findEachBuilding(self, html):
        doc = Pq(html)
        tr_list = doc("table>tr")
        # print("tr size ")
        for tr in tr_list:
            try:
                # URL into each building
                objid = doc(tr).attr("objid")
                if objid is None:
                    continue
                self._apartment_detail["BUILDING_NUM"] = Pq(tr)(
                    "td:eq(2)").text()
                url = "http://www.tywsfdc.com/Firsthand/tyfc/publish/p/ProNBView.do?proPID={}&nbid={}".format(
                    self._pid, objid)
                headers = {
                    'User-Agent':
                    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0',
                    'Referer': url
                }
                RequestURL = "http://www.tywsfdc.com/Firsthand/tyfc/publish/probld/NBView.do?nid={}&projectid={}".format(
                    objid, self._pid)
                values = {'nid': objid, 'projectid': self._pid}
                data = urllib.parse.urlencode(values).encode(encoding='UTF8')
                request = urllib.request.Request(url=RequestURL,
                                                 headers=headers,
                                                 data=data)

                m_fp = urllib.request.urlopen(request, timeout=500)
                html_str = m_fp.read().decode("utf8")
                self._extract_data(html_str)
            except Exception as e:
                print(e)
                pass
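
    # A standalone sanity check for the objid/td:eq(n) selectors used above
    # (a sketch; assumes pyquery is installed and imported as Pq, as elsewhere
    # in this file):
    #
    #   doc = Pq("<table><tr objid='9'><td>a</td><td>b</td><td>Bldg 3</td></tr></table>")
    #   for tr in doc("tr"):
    #       assert doc(tr).attr("objid") == "9"
    #       assert Pq(tr)("td:eq(2)").text() == "Bldg 3"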

    def _extract_data(self, doc_str):
        try:
            doc = Pq(doc_str)
            # each building unit
            building_list = doc("ul#bldlist>span")
            for building in building_list:
                bld = doc(building).attr("id")
                # the span id presumably carries a 3-char prefix (e.g. "bld"); strip it
                bld = bld[3:]
                self._apartment_detail["BUILDING_NUM"] = doc(building).text()
                # each floor:
                xpath = "div.flrlist>table#{}>tr".format(bld)
                tr_list = doc(xpath)
                for tr in tr_list:
                    self._apartment_detail["FLOOR_NUM"] = Pq(tr)(
                        "td:eq(0)").text()
                    a_list = Pq(tr)("td:eq(1)>span>a")
                    for a in a_list:
                        self._apartment_detail["APARTMENT_NUM"] = doc(a).text()
                        if self._apartment_detail["APARTMENT_NUM"].strip(
                        ) != '':
                            self._apartment_detail[
                                "create_time"] = time.strftime(
                                    '%Y-%m-%d %H:%M:%S',
                                    time.localtime(time.time()))
                            self._save_apartments()
        except Exception as err:
            print(err)
            # wait, then retry the whole page (note: this retry is unbounded)
            time.sleep(100)
            self._extract_data(doc_str)

    def _insert_apartment(self):
        result = "INSERT INTO  apartments (COMMUNITY_ID , BUILDING_NUM ,FLOOR_NUM , APARTMENT_NUM ,STATUS ,create_time  )" \
                 " VALUES ('{}','{}','{}','{}','{}','{}'  )".format(self._apartment_detail["COMMUNITY_ID"],
                                                                    self._apartment_detail["BUILDING_NUM"],
                                                                    self._apartment_detail["FLOOR_NUM"],
                                                                    self._apartment_detail["APARTMENT_NUM"],
                                                                    self._apartment_detail["STATUS"],
                                                                    self._apartment_detail["create_time"])
        return result

    def _save_apartments(self):
        # check whether this record already exists in the table
        query_sql = "SELECT * FROM apartments WHERE COMMUNITY_ID = {} AND BUILDING_NUM = '{}' AND APARTMENT_NUM = '{}' AND FLOOR_NUM = '{}'".format(
            int(self._apartment_detail["COMMUNITY_ID"]),
            self._apartment_detail["BUILDING_NUM"],
            self._apartment_detail["APARTMENT_NUM"],
            self._apartment_detail["FLOOR_NUM"])
        if Dao.execute_query(query_sql) is not None:
            print(" {} already exists, so skipping".format(
                str(self._apartment_detail["COMMUNITY_ID"]) +
                self._apartment_detail["BUILDING_NUM"] +
                self._apartment_detail["APARTMENT_NUM"]))
            return
        # insert the row
        try:
            Dao.execute_dmls(self._insert_apartment())
        except Exception as e:
            print(e)
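
    # The SQL above is assembled with str.format, so a scraped value containing
    # a quote would break the statement (and invites SQL injection). A hedged
    # parameterized sketch, assuming Dao wraps a DB-API 2.0 cursor; the
    # cursor/placeholder style shown is illustrative, not Dao's real interface:
    #
    #   cursor.execute(
    #       "SELECT * FROM apartments WHERE COMMUNITY_ID = %s AND BUILDING_NUM = %s"
    #       " AND APARTMENT_NUM = %s AND FLOOR_NUM = %s",
    #       (community_id, building_num, apartment_num, floor_num))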

    def craw(self):
        self._generate_seed_url()


class CommunitiesListCrawler(BaseCrawler):
    global Dao
    Dao = Dao()

    def __init__(self):
        # TODO: parameterize and run the crawl with multiple threads

        super().__init__()
        self.detail_info_urls = []
        self.source_id = 22
        self._base_url = "http://www.funi.com"
        self._community_detail = {
            'url': '',
            'name': '',
            'location': '',
            'area_name': '',
            'description': '',
            'latitude': '',
            'longitude': ''
        }

    def _visit_pages(self, seed_url):
        """
        visit one url,get page content
        """

        # crawl the single seed url
        html = self.get_page_content_str(self._seed_url[0])
        self.findEachArea(html)

    def _generate_seed_url(self):
        """
        generate all url to visit
        """

        # add the seed url directly, for testing
        self._seed_url.append("http://www.funi.com/loupan/region_605_0_HOUSE_0_1")


    def findEachArea(self, HTML):
        doc = Pq(HTML)
        aList = doc(".house-search>.s-con>dl>dd>a")
        # skip the "all regions" link and a few unrelated community links
        skip_links = ("/loupan/region_0_0_HOUSE_0_1", "/community/chaoyang/",
                      "/community/haidian/", "/community/fengtai/",
                      "/community/dongcheng/")
        for a in aList:
            self._community_detail["area_name"] = doc(a).text()
            a = doc(a).attr("href")
            if a not in skip_links:
                self._extract_data(self._base_url + a)

    def _extract_data(self, url):
        doc_str = self.get_page_content_str(url)
        if doc_str is None:
            return False
        if isinstance(doc_str, bytes):
            doc_str = doc_str.decode("utf8")
        # the page text means "Sorry, no communities matching your criteria were found"
        if "抱歉,没有找到符合您要求的小区" in doc_str:
            return False
        doc = Pq(doc_str)
        total_item = int(doc(".fleft>.pan-tab>dl>dt>p>i").text().strip())
        # 10 items per page, so fetch floor(total/10) + 1 pages
        count_num = total_item / 10
        for page in range(1, int(count_num) + 2):
            url1 = url[:url.rindex("_")] + "_" + str(page)
            html = self.get_page_content_str(url1)
            self._extract_data2(html)
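
    # Pagination check: with 10 items per page, the loop above visits
    # floor(total_item / 10) + 1 pages, which matches math.ceil(total_item / 10)
    # except when total_item is an exact multiple of 10 (one extra, empty page;
    # harmless given the duplicate check in _save_community). An exact sketch:
    #
    #   import math
    #   for page in range(1, math.ceil(total_item / 10) + 1):
    #       ...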

    def _extract_data2(self, doc_str):
        doc = Pq(doc_str)
        li_list = doc(".fleft>.pan-con.maplist.clearfix>dl")
        for li in li_list:
            self._community_detail['location'] = doc(li).find("i.address").attr("title")
            self._community_detail["url"] = self._base_url + doc(li).find(".clearfix>h2>a").attr("href")
            self._community_detail["name"] = doc(li).find(".clearfix>h2>a").text()
            print(self._community_detail)
            self._save_community()


    def _insert_community(self):
        result = "INSERT INTO communities (ORIGINAL_URL, NAME, AREA_NAME, LATITUDE, LONGITUDE, location, source_id)" \
                 " VALUES ('{}','{}','{}','{}','{}','{}','{}')".format(self._community_detail["url"],
                                                                       self._community_detail["name"],
                                                                       self._community_detail["area_name"],
                                                                       self._community_detail["latitude"],
                                                                       self._community_detail["longitude"],
                                                                       self._community_detail["location"],
                                                                       self.source_id)
        return result

    def _save_community(self):
        # check whether this record already exists in the table
        query_sql = "SELECT * FROM communities WHERE ORIGINAL_URL = '{}' AND source_id = {} ".format(
            self._community_detail["url"], self.source_id)
        if Dao.execute_query(query_sql) is not None:
            print(" {} already exists, so skipping".format(self._community_detail["name"]))
            return
        # insert the row
        Dao.execute_dmls(self._insert_community())
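

# A hedged usage sketch: it assumes BaseCrawler (defined elsewhere) exposes a
# craw() entry point that calls _generate_seed_url, as the first class above
# does, and that Dao's database connection is already configured:
#
#   if __name__ == '__main__':
#       CommunitiesListCrawler().craw()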