Exemple #1
0
 def __init__(self, city=None):
     """Open a LianJia session for ``city`` and cache what it provides.

     Args:
         city: passed straight to ``LianJiaSession``; defaults to ``None``.
     """
     # When crawling the newest listings there is no upper bound on the count.
     self.max_count = -1
     session = LianJiaSession(city)
     self.lian_jia_session = session
     self.__yaml_data = session.get_prop()
     self.base_url = session.get_city_url()
     self.__logger = session.get_logger()
     self.sql_session = session.get_sql_session()
Exemple #2
0
class LianJia:
    """Run the producer/consumer threads that crawl houses per xiao qu.

    Producers (``XiaoQuHouses``) take ``XiaoQu`` rows from one queue and feed
    a second queue; consumers (``ParseXiaoQuPage``) drain that second queue.
    """

    def __init__(self, city=None):
        """Open a ``LianJiaSession`` for ``city`` and cache its helpers.

        Args:
            city: passed straight to ``LianJiaSession``; defaults to ``None``.
        """
        self.lian_jia_session = LianJiaSession(city)
        self.__yaml_data = self.lian_jia_session.get_prop()
        self.__house_list = []
        self.__logger = self.lian_jia_session.get_logger()
        self.sql_session = self.lian_jia_session.get_sql_session()

    def parse(self, is_breaking=False):
        """Start all worker threads and block until every one has finished.

        The number of threads comes from the ``producer_num`` and
        ``consumer_num`` configuration keys.

        Args:
            is_breaking: forwarded to ``utils.reset_xiao_qu_status``
                (presumably "resume an interrupted run" -- confirm there).
        """
        utils.reset_xiao_qu_status(self.sql_session, is_breaking)
        xiao_qu_queue = self.__get_xiao_qu_map()
        xiao_qu_id_soup_queue = queue.Queue()
        producer_arr = []
        consumer_arr = []
        for i in range(self.__yaml_data['producer_num']):
            producer = XiaoQuHouses(self.lian_jia_session, xiao_qu_queue,
                                    xiao_qu_id_soup_queue)
            producer.setName('producer - ' + str(i))
            producer_arr.append(producer)
            producer.start()
        for i in range(self.__yaml_data['consumer_num']):
            consumer = ParseXiaoQuPage(self.lian_jia_session,
                                       xiao_qu_id_soup_queue)
            consumer.setName('consumer - ' + str(i))
            consumer.start()
            consumer_arr.append(consumer)

        # Wait for the producers first so nothing is enqueued after the
        # end-of-work markers below.
        for item in producer_arr:
            if item.is_alive():
                item.join()
        self.__logger.info('producer completed!.....................')

        # Two (None, None) markers are queued per consumer, exactly as before.
        # NOTE(review): presumably ParseXiaoQuPage stops on this marker --
        # confirm whether one marker per consumer would be enough.
        for _ in consumer_arr:
            xiao_qu_id_soup_queue.put((None, None))
            xiao_qu_id_soup_queue.put((None, None))
        for item in consumer_arr:
            if item.is_alive():
                item.join()
        self.__logger.info('consumer completed!.....................')

    def __get_xiao_qu_map(self):
        """Return a ``queue.Queue`` of the ``XiaoQu`` rows still to be crawled.

        A row qualifies when its ``status`` is false and its ``zai_shou`` is
        at least the ``min_house`` configuration value.
        """
        # "== False" builds a SQL expression here; it must not become "is".
        xiao_qus = self.sql_session.query(XiaoQu).filter(
            XiaoQu.status == False).filter(  # noqa: E712
                XiaoQu.zai_shou >= self.__yaml_data['min_house']).all()
        xiao_qu_quenue = queue.Queue()
        for item in xiao_qus:
            xiao_qu_quenue.put(item)
        self.__logger.info('发现小区数量 : [{0}](小区房源数量至少{1})'.format(
            xiao_qu_quenue.qsize(), self.__yaml_data['min_house']))
        return xiao_qu_quenue
Exemple #3
0
 def __init__(self, city=None):
     """Open a LianJia session for ``city`` and preload the known xiao qu URLs.

     Args:
         city: passed straight to ``LianJiaSession``; defaults to ``None``.
     """
     session = LianJiaSession(city)
     self.__lian_jia_session = session
     self.__yaml_data = session.get_prop()
     self.__logger = session.get_logger()
     sql_session = session.get_sql_session()
     self.__sql_session = sql_session
     self.__base_url = session.get_city_url()

     # URL of every XiaoQu row currently returned by the database.
     self.__xiao_qu_urls = {row.url for row in sql_session.query(XiaoQu).all()}
Exemple #4
0
 def __init__(self, city=None):
     """Open a LianJia session for ``city`` and load the stored district names.

     Args:
         city: passed straight to ``LianJiaSession``; defaults to ``None``.
     """
     session = LianJiaSession(city)
     self.__lian_jia_session = session
     self.__yaml_data = session.get_prop()
     self.__house_list = []
     self.__logger = session.get_logger()
     self.__sql_session = session.get_sql_session()
     self.__base_url = session.get_city_url()
     # Names seen so far; guards against recording the same name twice.
     self.__district_names = set()
     # Seed the set with the districts that already exist.
     self.get_exixt_district_url()
Exemple #5
0
class AllDistrict:
    """Crawl the district filter links of the xiao qu index and store them.

    Top-level districts are read from ``<base_url>/xiaoqu/``; each district
    page is then fetched to collect its child areas, stored with ``parent``
    set to the top-level district's id.
    """

    def __init__(self, city=None):
        """Open a ``LianJiaSession`` for ``city`` and load known district names.

        Args:
            city: passed straight to ``LianJiaSession``; defaults to ``None``.
        """
        self.__lian_jia_session = LianJiaSession(city)
        self.__yaml_data = self.__lian_jia_session.get_prop()
        self.__house_list = []
        self.__logger = self.__lian_jia_session.get_logger()
        self.__sql_session = self.__lian_jia_session.get_sql_session()
        self.__base_url = self.__lian_jia_session.get_city_url()
        # Names seen so far; guards against recording the same name twice.
        self.__district_names = set()
        # Seed the set with the districts that already exist.
        self.get_exixt_district_url()

    def parse(self):
        """Fetch and persist the top-level districts, then their child areas."""
        index_links = self.__parse_html('{0}/xiaoqu/'.format(self.__base_url))
        top_level = self.__parse_a(index_links)
        self.__sql_session.add_all(top_level)
        # Committed before the second pass, which reads ``parent.id``
        # (presumably populated by the database on insert).
        self.__sql_session.commit()

        for parent in top_level:
            child_links = self.__parse_html(self.__base_url + parent.url)
            children = self.__parse_a(child_links, parent.id)
            self.__sql_session.add_all(children)
        self.__sql_session.commit()

    def __parse_html(self, url):
        """Download ``url`` and return the ``.m-filter .position a`` elements."""
        rep = self.__lian_jia_session.get(url)
        soup = BeautifulSoup(rep.text, 'lxml')
        return soup.select('.m-filter .position a')

    def __parse_a(self, items, parent_id=None):
        """Build ``District`` objects from anchor elements.

        Anchors without an ``href``, anchors whose name is already known and
        anchors whose URL is in ``exclude_url`` are skipped.

        Args:
            items: iterable of BeautifulSoup anchor tags.
            parent_id: id stored in ``District.parent`` when truthy.

        Returns:
            List of new, not yet persisted ``District`` objects.
        """
        districts = []
        for item in items:
            # hasattr() on a Tag does not test the HTML attribute, so an
            # anchor without href used to reach item['href'] and raise
            # KeyError. Tag.get() returns None for a missing attribute.
            url = item.get('href')
            if url is None:
                continue
            name = str(item.string)
            if name in self.__district_names or url in exclude_url:
                continue
            district = District()
            district.name = name
            district.url = str(url)
            if parent_id:
                district.parent = parent_id
            self.__logger.info(
                'add district name[{0}], url[{1}], parentId[{2}]'.
                format(name, url, parent_id))
            districts.append(district)
            self.__district_names.add(name)
        return districts

    def get_exixt_district_url(self):
        """Add the name of every stored ``District`` row to the known-name set."""
        for d in self.__sql_session.query(District).all():
            self.__district_names.add(d.name)
Exemple #6
0
 def __init__(self, report_date, city=None):
     """Collect session details and open a raw database cursor for a report.

     Args:
         report_date: stored unchanged as ``self.query_time``.
         city: passed straight to ``LianJiaSession``; defaults to ``None``.
     """
     session = LianJiaSession(city)
     self.city = session.city
     self.city_zh = session.get_city_zh()
     sql_engine = session.get_sql_engine()
     self.__yaml_data = session.get_prop()
     self.root_path = session.get_log_path()
     self.logging = session.get_logger()
     self.log_file_path = session.get_log_file_name()
     # Only the cursor is kept; the raw connection itself is not stored.
     raw_connection = sql_engine.raw_connection()
     self.cursor = raw_connection.cursor()
     self.query_time = report_date
     self.today = datetime.today()
     self.date_str = self.get_date_str()
Exemple #7
0
 def __init__(self, city=None):
     """Open a LianJia session for ``city`` and cache what it provides.

     Args:
         city: passed straight to ``LianJiaSession``; defaults to ``None``.
     """
     session = LianJiaSession(city)
     self.lian_jia_session = session
     self.__yaml_data = session.get_prop()
     self.base_url = session.get_city_url()
     self.__logger = session.get_logger()
     self.sql_session = session.get_sql_session()
Exemple #8
0
class ChengJiaoHouse:
    """Crawl completed-deal ("cheng jiao") pages and store ``ChengJiao`` rows.

    Pages are fetched through ``LianJiaSession`` and parsed with
    BeautifulSoup; rows are written through the SQL session obtained from the
    same ``LianJiaSession``.
    """

    def __init__(self, city=None):
        """Open a ``LianJiaSession`` for ``city`` and cache its helpers.

        Args:
            city: passed straight to ``LianJiaSession``; defaults to ``None``.
        """
        self.lian_jia_session = LianJiaSession(city)
        self.__yaml_data = self.lian_jia_session.get_prop()
        self.base_url = self.lian_jia_session.get_city_url()
        self.__logger = self.lian_jia_session.get_logger()
        self.sql_session = self.lian_jia_session.get_sql_session()

    def test(self, is_breaking=False):
        """Mark every pending xiao qu that already has deal rows as done.

        Args:
            is_breaking: unused; kept for signature compatibility.
        """
        # "== False" builds a SQL expression here; it must not become "is".
        xiao_qus = self.sql_session.query(XiaoQu).filter(
            XiaoQu.status == False).filter(  # noqa: E712
                XiaoQu.zai_shou >= self.__yaml_data['min_house']).all()
        for xiao_qu in xiao_qus:
            cjs = self.sql_session.query(ChengJiao).filter(
                ChengJiao.xiao_qu == xiao_qu.id).all()
            if cjs:
                xiao_qu.status = True
        self.sql_session.commit()

    def parse(self, is_breaking=False):
        """Crawl the deal pages of every pending xiao qu.

        Each xiao qu is flagged ``status = True`` and committed once its
        pages have been parsed.

        Args:
            is_breaking: forwarded to ``utils.reset_xiao_qu_status``
                (presumably "resume an interrupted run" -- confirm there).
        """
        utils.reset_xiao_qu_status(self.sql_session, is_breaking)
        xiao_qu_queue = self.__get_xiao_qu_map()
        t = xiao_qu_queue.qsize()
        i = 0
        while not xiao_qu_queue.empty():
            i += 1
            self.__logger.info('总进度 ===> {:0.2f}'.format(i / t))
            xiao_qu = xiao_qu_queue.get()
            if xiao_qu is None:
                break
            # Use the configured city URL (as parse_latest does) rather than
            # a hard-coded host, so cities other than "wh" are crawled too.
            url = '{0}/chengjiao/c{1}/'.format(self.base_url, xiao_qu.url)
            self.parse_page(url, xiao_qu.id)
            xiao_qu.status = True
            self.sql_session.commit()

    def parse_latest(self):
        """Crawl the city-wide deal list at ``<base_url>/chengjiao/``."""
        url = '{0}/chengjiao/'.format(self.base_url)
        self.parse_page(url)

    def parse_page(self, url, xiao_qu_id=None):
        """Parse the deal list at ``url`` and all of its follow-up pages.

        Lists reporting more than 1000 deals are logged as abnormal and
        skipped.

        Args:
            url: absolute URL of the first list page.
            xiao_qu_id: id stored on each new row; when ``None`` the id is
                looked up per row from the community name in its title.
        """
        rep = self.lian_jia_session.get(url)
        soup = BeautifulSoup(rep.text, 'lxml')
        total_div = soup.find('div', attrs={'class': 'total'})
        total = int(total_div.find('span').get_text(strip=True))
        if total > 1000:
            self.__logger.info('error 小区[{0}] 发现房源数异常[{1}]'.format(url, total))
            return
        self.__logger.info('发现总房源{0}套'.format(total))
        page_url_list = utils.get_all_page(soup)
        self.__parse_soup(soup, xiao_qu_id)
        # Follow-up page URLs are joined onto base_url, so they are expected
        # to be site-relative paths.
        for page_url in page_url_list:
            rep = self.lian_jia_session.get(self.base_url + page_url)
            soup = BeautifulSoup(rep.text, 'lxml')
            self.__parse_soup(soup, xiao_qu_id)

    def __get_xiao_qu_id_by_name(self, name):
        """Return the id of the unique ``XiaoQu`` named ``name``.

        Returns ``None`` (after logging) when no row or several rows match.
        """
        xiao_qus = self.sql_session.query(XiaoQu).filter(
            XiaoQu.name == name).all()
        if len(xiao_qus) == 1:
            return xiao_qus[0].id
        if len(xiao_qus) > 1:
            self.__logger.info('error 找多个同名小区 名称[{0}]'.format(name))
        else:
            self.__logger.info('error 找不到小区 名称[{0}]'.format(name))
        return None

    @staticmethod
    def _parse_deal_date(text):
        """Convert a ``YYYY.MM.DD`` string into a ``date``."""
        y_m_d = text.split('.')
        return date(int(y_m_d[0]), int(y_m_d[1]), int(y_m_d[2]))

    @staticmethod
    def _parse_deal_cycle(deal_cycle_txt):
        """Extract ``(listing price, days on market)`` from the deal-cycle text.

        The price is the text between '牌' and '万'; the day count is the
        text between '期' and '天'.
        """
        gua_pai_jia = float(deal_cycle_txt[deal_cycle_txt.find('牌') +
                                           1:deal_cycle_txt.find('万')])
        zhou_qi = int(deal_cycle_txt[deal_cycle_txt.find('期') +
                                     1:deal_cycle_txt.find('天')])
        return gua_pai_jia, zhou_qi

    def __parse_soup(self, soup, xiao_qu_id):
        """Store a ``ChengJiao`` row for every new deal on one list page.

        Deals whose URL is already stored, and deals whose title does not
        split into exactly three space-separated parts, are skipped.

        Args:
            soup: parsed list page.
            xiao_qu_id: community id for every row, or ``None`` to resolve
                it per row from the first part of the title.
        """
        for li in soup.select('ul.listContent li'):
            info_div = li.find('div', attrs={'class': 'info'})
            url, title = utils.get_url_title(info_div)
            exist = self.sql_session.query(ChengJiao).filter(
                ChengJiao.url == url).one_or_none()
            # Skip deals that are already stored.
            if exist is not None:
                continue
            args = title.split(' ')
            if len(args) != 3:
                continue

            # Resolve the community per row. Overwriting the xiao_qu_id
            # parameter made every later row on the page inherit the first
            # row's community.
            if xiao_qu_id is None:
                row_xiao_qu_id = self.__get_xiao_qu_id_by_name(args[0])
            else:
                row_xiao_qu_id = xiao_qu_id

            price = info_div.find(
                'div', attrs={'class': 'totalPrice'}).find(
                    'span', attrs={'class': 'number'}).get_text(strip=True)
            unit_price = info_div.find(
                'div', attrs={'class': 'unitPrice'}).find(
                    'span', attrs={'class': 'number'}).get_text(strip=True)
            deal_date = self._parse_deal_date(
                info_div.find('div', attrs={'class': 'dealDate'
                                            }).get_text(strip=True))
            deal_cycle_txt = info_div.find(
                'span', attrs={'class': 'dealCycleTxt'}).get_text(strip=True)
            flood = info_div.find(
                'div', attrs={'class': 'positionInfo'}).get_text(strip=True)
            gua_pai_jia, zhou_qi = self._parse_deal_cycle(deal_cycle_txt)

            cheng_jiao = ChengJiao()
            cheng_jiao.xiao_qu = row_xiao_qu_id
            cheng_jiao.price = float(price)
            cheng_jiao.unit_price = float(unit_price)
            cheng_jiao.url = url
            cheng_jiao.title = title
            cheng_jiao.deal_date = deal_date
            cheng_jiao.gua_pai_jia = gua_pai_jia
            cheng_jiao.zhou_qi = zhou_qi
            cheng_jiao.flood = flood
            cheng_jiao.hu_xing = args[1]
            cheng_jiao.area1 = float(re.findall(r"\d+\.?\d*", args[2])[0])
            self.__logger.info(
                'url[{0}] 标题[{1}] 价格[{2}] 单价[{3}] 成交日期[{4}]'.format(
                    url, title, gua_pai_jia, unit_price, deal_date))
            # Add each row once, as soon as it is built, instead of re-adding
            # the whole growing list on every iteration. The row is still
            # pending before the next iteration's existence query runs.
            self.sql_session.add(cheng_jiao)
        self.sql_session.commit()

    def __get_xiao_qu_map(self):
        """Return a ``queue.Queue`` of the ``XiaoQu`` rows still to be crawled.

        A row qualifies when its ``status`` is false and its ``zai_shou`` is
        at least the ``min_house`` configuration value.
        """
        xiao_qus = self.sql_session.query(XiaoQu).filter(
            XiaoQu.status == False).filter(  # noqa: E712
                XiaoQu.zai_shou >= self.__yaml_data['min_house']).all()
        xiao_qu_quenue = queue.Queue()
        for item in xiao_qus:
            xiao_qu_quenue.put(item)
        self.__logger.info('发现小区数量 : [{0}](小区房源数量至少{1})'.format(
            xiao_qu_quenue.qsize(), self.__yaml_data['min_house']))
        return xiao_qu_quenue
Exemple #9
0
    price = Column(FLOAT(8))
    change_time = Column(DATETIME)
    create_time = Column(DATETIME, server_default=func.now())


def create_view(engine):
    """Create the ``price_change_com`` and ``district_area`` SQL views.

    The statements are plain ``CREATE VIEW``, so calling this when the views
    already exist makes the database raise.

    Args:
        engine: object whose ``raw_connection()`` returns a DB-API
            connection (the SQLAlchemy engine in this file).
    """
    statements = (
        'CREATE VIEW price_change_com AS SELECT p.house_id AS house_id, p.pre_price AS pre_price,p.price AS price,( p.price - p.pre_price ) AS priceChange, Round( ( p.price - p.pre_price ) / p.pre_price * 100, 2 ) AS fudu,p.change_time AS change_time FROM price_change p',
        'CREATE VIEW district_area AS SELECT  d1.id id,  d2.NAME district,  d1.NAME area,  d1.url url FROM district d1,   district d2   WHERE   d2.id = d1.parent',
    )
    conn = engine.raw_connection()
    try:
        cursor = conn.cursor()
        try:
            for sql in statements:
                cursor.execute(sql)
        finally:
            cursor.close()
        # Commit explicitly: not every driver auto-commits DDL.
        conn.commit()
    finally:
        # Always release the connection, even when a statement fails.
        conn.close()


if __name__ == '__main__':
    # Usage: <script> [city [view]]
    # With no arguments the default LianJiaSession() is used. Any non-empty
    # third argument enables view creation (it is kept as a string).
    view = False
    if len(sys.argv) == 3:
        filename, city, view = sys.argv
        lian_jia_session = LianJiaSession(city)
    # "elif", not "if": otherwise the three-argument case fell through to the
    # else branch and replaced the city session with the default one.
    elif len(sys.argv) == 2:
        filename, city = sys.argv
        lian_jia_session = LianJiaSession(city)
    else:
        lian_jia_session = LianJiaSession()

    engine = lian_jia_session.get_sql_engine()

    base.metadata.create_all(engine)  # create the table structure
    if view:
        create_view(engine)
Exemple #10
0
class AllXiaoQu:
    """Crawl the xiao qu (community) list of every child district.

    Communities whose URL slug is already known are skipped; new ones are
    returned as ``XiaoQu`` objects and committed per district.
    """

    def __init__(self, city=None):
        """Open a ``LianJiaSession`` for ``city`` and preload known URL slugs.

        Args:
            city: passed straight to ``LianJiaSession``; defaults to ``None``.
        """
        self.__lian_jia_session = LianJiaSession(city)
        self.__yaml_data = self.__lian_jia_session.get_prop()
        self.__logger = self.__lian_jia_session.get_logger()
        self.__sql_session = self.__lian_jia_session.get_sql_session()
        self.__base_url = self.__lian_jia_session.get_city_url()

        # URL slug of every XiaoQu row already stored, used for de-duplication.
        self.__xiao_qu_urls = set()
        for item in self.__sql_session.query(XiaoQu).all():
            self.__xiao_qu_urls.add(item.url)

    def get_xiao_qu_list(self):
        """Crawl every district that has a parent, committing after each one."""
        # "!= None" builds a SQL expression here; it must not become "is not".
        district_list = self.__sql_session.query(District).filter(
            District.parent != None).all()  # noqa: E711
        # Dividing by 100 up front makes i / t a percentage.
        t = len(district_list) / 100
        for i, district in enumerate(district_list, start=1):
            self.__logger.info('进度[{0:.2f}] 开始解析片区[{1}] id[{2}]===> url[{3}]'.format(
                i / t, district.name, district.id, district.url))
            xiao_qu_list = self.__parse_a_district(district)
            self.__sql_session.add_all(xiao_qu_list)
            self.__sql_session.commit()

    def __parse_a_district(self, district):
        """Return the new ``XiaoQu`` objects from all list pages of a district."""
        rep = self.__lian_jia_session.get(self.__base_url + district.url)
        soup = BeautifulSoup(rep.text, 'lxml')

        xiao_qu_list = self.__parse_page(soup, district.id)
        for url in utils.get_all_page(soup):
            rep = self.__lian_jia_session.get(self.__base_url + url)
            soup = BeautifulSoup(rep.text, 'lxml')
            xiao_qu_list.extend(self.__parse_page(soup, district.id))
        self.__logger.info('添加小区数量 ==> {0}'.format(len(xiao_qu_list)))
        return xiao_qu_list

    @staticmethod
    def _xiao_qu_code_from_href(href):
        """Return the last path segment of ``href``, dropping its final character.

        For ``.../xiaoqu/123/`` this yields ``123``; the final character is
        expected to be the trailing slash.
        """
        index = href.rfind('/', 0, len(href) - 1)
        return href[index + 1:len(href) - 1]

    def __parse_page(self, soup, district):
        """Build ``XiaoQu`` objects for the not-yet-known entries on one page.

        Args:
            soup: parsed list page.
            district: id stored in ``XiaoQu.district``.

        Returns:
            List of new, not yet persisted ``XiaoQu`` objects.
        """
        xiao_qu_list = []
        for li in soup.select('ul.listContent li'):
            # attrs must be a dict; the previous {'class', 'info'} was a set
            # literal that only matched through BeautifulSoup's fallback.
            div = li.find('div', attrs={'class': 'info'})
            a = div.find('div', attrs={'class': 'title'}).a
            url = self._xiao_qu_code_from_href(a['href'])
            name = a.get_text(strip=True)
            if url in self.__xiao_qu_urls:
                continue
            self.__xiao_qu_urls.add(url)

            xiao_qu = XiaoQu()
            xiao_qu.district = district
            xiao_qu.url = url
            xiao_qu.name = name

            house_info = div.find(
                'div', attrs={'class': 'houseInfo'}).get_text(strip=True)
            # Deals count: text between '交' and the first '套'.
            xiao_qu.cheng_jiao_90 = int(
                house_info[house_info.find('交') + 1:house_info.find('套')])
            # Rentals count: text between the last '|' and the last '套'.
            xiao_qu.chu_zu = int(
                house_info[house_info.rfind('|') + 1:house_info.rfind('套')])

            nian_dai = div.select('div.positionInfo')[0].get_text(strip=True)
            nian_dai = nian_dai[nian_dai.find('/') + 1:nian_dai.find('年')]
            # A non-numeric build year is stored as 0.
            xiao_qu.nian_dai = int(nian_dai) if nian_dai.isdigit() else 0

            average_price = li.find(
                'div', attrs={'class': 'totalPrice'}).get_text(strip=True)
            average_price = average_price[:average_price.find('元')]
            # average_price stays unset when the page shows no numeric price.
            if average_price.isdigit():
                xiao_qu.average_price = float(average_price)

            zai_shou = li.find(
                'div', attrs={'class': 'xiaoquListItemSellCount'
                              }).find('a').get_text(strip=True)
            xiao_qu.zai_shou = int(zai_shou[:zai_shou.find('套')])
            xiao_qu_list.append(xiao_qu)
        return xiao_qu_list
Exemple #11
0
 def __init__(self, city=None):
     """Open a LianJia session for ``city`` and cache what it provides.

     Args:
         city: passed straight to ``LianJiaSession``; defaults to ``None``.
     """
     session = LianJiaSession(city)
     self.lian_jia_session = session
     self.__yaml_data = session.get_prop()
     self.__house_list = []
     self.__logger = session.get_logger()
     self.sql_session = session.get_sql_session()