Code Example #1
File: pipelines.py  Project: liudebin/lianjiaershou
    def process_item(self, item, spider):
        dbObject = mysqlHadler().dbHandle()
        cursor = dbObject.cursor()
        cursor.execute("USE test")
        # sql = "INSERT INTO articles(author,title,times,url,admire,likes) VALUES(%s,%s,%s,%s,%s,%s)"
        sql = "INSERT INTO  ershoufang_detail_lianjia(title,link,house_code," \
              "community_code,community_link,community,town_link," \
              "town_disc, info,chan_quan,follow_info,tags,price,price_content," \
              "uni_price_content, rent_status) " \
              "VALUES(%s,%s,%s,%s,%s,%s, %s,%s,%s,%s,%s,%s,%s,%s,%s, 0)" \
              " on duplicate key update rent_status = 2 "
        # sql1 = "insert rent_scrapy_tmp(house_code) values(%s) "
        try:
            cursor.execute(
                sql, (item['title'], item['link'], item['house_code'],
                      item['community_code'], item['community_link'],
                      item['community'], item['town_link'], item['town_disc'],
                      item['house_info'], item['chan_quan'],
                      item['follow_info'], item['tags'], item['price'],
                      item['price_content'], item['unit_price']))
            cursor.connection.commit()
        except BaseException as e:
            print("ERROR>>>>>>>>>>>>>", e, "<<<<<<<<<<<<<ERROR")
            dbObject.rollback()

        # cursor.execute(sql1, item['house_code'])
        # cursor.connection.commit()
        return item
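A note on connection handling: each process_item above opens a brand-new MySQL connection for every scraped item. Scrapy pipelines provide open_spider/close_spider hooks for exactly this case; the sketch below is a minimal illustration (assuming, as the examples suggest, that mysqlHadler().dbHandle() returns a DB-API connection) that opens one connection per crawl instead:

class MysqlUpsertPipeline(object):
    # Hypothetical rewrite, not part of the original project.
    def open_spider(self, spider):
        # Called once when the crawl starts.
        self.dbObject = mysqlHadler().dbHandle()
        self.cursor = self.dbObject.cursor()
        self.cursor.execute("USE test")

    def close_spider(self, spider):
        # Called once when the crawl ends.
        self.cursor.close()
        self.dbObject.close()

    def process_item(self, item, spider):
        # Reuse self.cursor here exactly as the examples above do.
        return item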
Code Example #2
    def process_item(self, item, spider):
        dbObject = mysqlHadler().dbHandle()
        cursor = dbObject.cursor()
        cursor.execute("USE test")
        # sql = "INSERT INTO articles(author,title,times,url,admire,likes) VALUES(%s,%s,%s,%s,%s,%s)"
        sql = "update rent_detail_lianjia set price_content = %s, tags= %s,house_type= %s," \
              " sub_way= %s,house_comment= %s,upload_date= %s,square= %s,direction= %s,base_info= %s ," \
              " community_code = %s, community = %s,  community_link = %s" \
              " , rent_status = CASE WHEN rent_status = -1 THEN 0 ELSE rent_status END" \
              " where house_code = %s"
        # sql = "update rent_detail_lianjia set community_code = %s, community = %s,  community_link = %s where house_code = %s"
        try:
            cursor.execute(
                sql,
                (item['price_content'], item['tags'], item['house_type'],
                 item['sub_way'], item['house_comment'], item['upload_date'],
                 item['square'], item['direct'], item['base_info_str'],
                 item['community_code'], item['community'],
                 item['community_link'], item['house_code']))
            # cursor.execute(sql, (item['community_code'], item['community'], item['community_link'], item['house_code']))
            cursor.connection.commit()
        except BaseException as e:
            print("ERROR>>>>>>>>>>>>>", e, "<<<<<<<<<<<<<ERROR")
            dbObject.rollback()
        return item
Code Example #3
class ToScrapeSpiderXPath(scrapy.Spider):
    name = 'lianjia'

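    # NOTE: this block runs at class-definition (import) time, before the
    # crawl starts; see the deferred alternative sketched after this example.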
    dbObject = mysqlHadler().dbHandle()
    cursor = dbObject.cursor()
    cursor.execute("USE test")
    # sql = "INSERT INTO articles(author,title,times,url,admire,likes) VALUES(%s,%s,%s,%s,%s,%s)"
    sql = "update rent_detail_lianjia set rent_status = -1 where rent_status <> -1 ;"
    # sql = "delete from rent_scrapy_tmp;"
    cursor.execute(sql)
    cursor.connection.commit()

    
    box = []
    for num in range(102):
        pages = 'https://sh.lianjia.com/zufang/pudong/pg{0}/#contentList'.format(num)
        box.append(pages)
    start_urls = box

    # start_urls = [
    #     'https://sh.lianjia.com/zufang/pudong',
    # ]

    # dbObject = mysqlHadler().dbHandle()
    # cursor = dbObject.cursor()
    # cursor.execute("USE test")
    # # sql = "INSERT INTO articles(author,title,times,url,admire,likes) VALUES(%s,%s,%s,%s,%s,%s)"
    # sql = "select distinct community_link link from rent_detail_lianjia where community_code is not null ;"
    # cursor.execute(sql)
    # results = cursor.fetchall()
    # box = []
    # for row in results:
    #     box.append(row[0])
    # start_urls = box

    def parse(self, response):
        print(response)
        for quote in response.xpath('//div[@class="content__list--item"]'):
            price = int(quote.xpath('.//span[@class="content__list--item-price"]/em/text()')[0].extract())
            # print(quote)
            yield {
                'title': getList(quote.xpath('.//div/p[@class="content__list--item--title twoline"]/a/text()')),
                'link': baseUrl + getList(quote.xpath('.//div/p[@class="content__list--item--title twoline"]/a/@href')),
                'house_code': getList(quote.xpath('.//div/p[@class="content__list--item--title twoline"]/a/@href'))[
                              8:-5],
                'area_link': baseUrl + getList(
                    quote.xpath('.//div/p[@class="content__list--item--des"]/a[position()=1]/@href')),
                'area_disc': getList(
                    quote.xpath('.//div/p[@class="content__list--item--des"]/a[position()=1]/text()')),
                'town_link': baseUrl + getList(
                    quote.xpath('.//div/p[@class="content__list--item--des"]/a[position()=2]/@href')),
                'town_disc': getList(
                    quote.xpath('.//div/p[@class="content__list--item--des"]/a[position()=2]/text()')),
                'price': price
            }
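Because the status-reset SQL and the URL generation sit in the class body, they execute the moment the module is imported, not when the crawl starts. A minimal sketch of a deferred alternative, using Scrapy's standard start_requests hook (hypothetical; the same idea applies to the other two spiders below):

class ToScrapeSpiderXPath(scrapy.Spider):
    name = 'lianjia'

    def start_requests(self):
        # Runs when the crawl starts, not at import time.
        dbObject = mysqlHadler().dbHandle()
        cursor = dbObject.cursor()
        cursor.execute("USE test")
        cursor.execute("update rent_detail_lianjia set rent_status = -1 where rent_status <> -1")
        cursor.connection.commit()
        for num in range(102):
            url = 'https://sh.lianjia.com/zufang/pudong/pg{0}/#contentList'.format(num)
            yield scrapy.Request(url, callback=self.parse)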
Code Example #4
    def process_item(self, item, spider):
        dbObject = mysqlHadler().dbHandle()
        cursor = dbObject.cursor()
        cursor.execute("USE test")
        # sql = "INSERT INTO articles(author,title,times,url,admire,likes) VALUES(%s,%s,%s,%s,%s,%s)"
        sql = "INSERT INTO  rent_detail_lianjia(title,link,house_code," \
              "area_link,area_disc,town_link," \
              "town_disc, price, rent_status) " \
              "VALUES(%s,%s,%s,%s,%s,%s,%s, %s, 0) on duplicate key update rent_status = 2 "
        sql1 = "insert rent_scrapy_tmp(house_code) values(%s) "
        try:
            cursor.execute(
                sql, (item['title'], item['link'], item['house_code'],
                      item['area_link'], item['area_disc'], item['town_link'],
                      item['town_disc'], item['price']))
            cursor.connection.commit()
        except BaseException as e:
            print("ERROR>>>>>>>>>>>>>", e, "<<<<<<<<<<<<<ERROR")
            dbObject.rollback()

        # cursor.execute(sql1, item['house_code'])
        # cursor.connection.commit()
        return item
Code Example #5
class ToScrapeSpiderXPath(scrapy.Spider):
    name = 'lianjiaDetail'
    dbObject = mysqlHadler().dbHandle()
    cursor = dbObject.cursor()
    cursor.execute("USE test")
    # sql = "INSERT INTO articles(author,title,times,url,admire,likes) VALUES(%s,%s,%s,%s,%s,%s)"
    sql = "select link from rent_detail_lianjia where community_code is null ;"
    cursor.execute(sql)
    results = cursor.fetchall()
    box = []
    for row in results:
        box.append(row[0])
    # start_urls = [
    #     'https://sh.lianjia.com/zufang/SH2135972204813492224.html'
    # ]
    start_urls = box

    def parse(self, response):
        # print response
subtitle = response.xpath('//div[@class="content__subtitle"]')[
            0].xpath('string(.)').extract()[0].strip()
        upload_date = subtitle[12:22]
        code = response.xpath('//div[@class="content__subtitle"]')[0].xpath(
            ".//i[position()=2]/text()")[0].extract()
        house_code = code[5:]

        content__aside = response.xpath(
            './/div[@class="content__aside fr"]')[0]
        price_content = content__aside.xpath(
            './/div[@class="content__aside--title"]')[0].xpath(
                'string(.)').extract()
        price = int(
            content__aside.xpath(
                './/div[@class="content__aside--title"]/span/text()')
            [0].extract())

        tags = ""
        for quote in content__aside.xpath(
                './/p[@class="content__aside--tags"]/i'):
            tags = tags + "/" + quote.xpath('string(.)').extract()[0]

        content__article__table = \
            content__aside.xpath('//ul[@class="content__aside__list"]')[0]

        house_type_content_span = content__article__table.xpath(
            './/li[position()=2]/span')[0].xpath('string(.)').extract_first()
        house_type_content = content__article__table.xpath(
            './/li[position()=2]')[0].xpath('string(.)').extract_first()
        house_type_content1 = house_type_content.replace(
            house_type_content_span, "")
        house_type = house_type_content1.split()[0]

        square = house_type_content1.split()[1]

        direct = content__article__table.xpath('.//li[position()=3]')[0].xpath(
            'string(.)').extract()

        base_info = response.xpath(
            './/div[@class="content__article__info"]')[0]
        base_info_str = ""
        for quote in base_info.xpath('.//ul/li[@class="fl oneline"]'):
            base_info_str = base_info_str + quote.xpath(
                "string(.)").extract()[0] + "\n"

        houseComment = response.xpath(
            './/p[@data-el="houseComment"]/attribute::data-desc')
        house_comment = ""
        if houseComment:
            house_comment = houseComment[0].extract()

        sub_way = ""
        for quote in response.xpath(
                './/div[@class="content__article__info4"]/ul/li'):
            sub_way = sub_way + quote.xpath(
                './/span[position()=1]/text()')[0].extract()
            sub_way = sub_way + "- " + quote.xpath(
                './/span[position()=2]/text()')[0].extract() + "\n"

        community_link = baseUrl + response.xpath(
            '//div[@class="bread__nav w1150 bread__nav--bottom"]/h1/a/@href'
        )[0].extract()
        community_code = response.xpath(
            '//div[@class="bread__nav w1150 bread__nav--bottom"]/h1/a/@href'
        )[0].extract()[8:-1]
        community = response.xpath(
            '//div[@class="bread__nav w1150 bread__nav--bottom"]/h1/a/text()'
        )[0].extract()[:-2]

        yield {
            'house_code': house_code,
            'price_content': price_content,
            'tags': tags,
            'house_type': house_type,
            'sub_way': sub_way,
            'house_comment': house_comment,
            'upload_date': upload_date,
            'square': square,
            'direct': direct,
            'base_info_str': base_info_str,
            'community_code': community_code,
            'community': community,
            'community_link': community_link
        }
Code Example #6
class ToScrapeSpiderXPath(scrapy.Spider):
    name = 'lianjiaCommunity'

    # box = []
    # for num in range(102):
    #     pages = 'https://sh.lianjia.com/zufang/pudong/pg{0}/#contentList'.format(num)
    #     box.append(pages)
    # start_urls = box

    # start_urls = [
    #     'https://sh.lianjia.com/zufang/pudong',
    # ]

    dbObject = mysqlHadler().dbHandle()
    cursor = dbObject.cursor()
    cursor.execute("USE test")
    # sql = "INSERT INTO articles(author,title,times,url,admire,likes) VALUES(%s,%s,%s,%s,%s,%s)"
    sql = "select distinct community_code link from ershoufang_detail_lianjia where community_code is not null ;"
    cursor.execute(sql)
    results = cursor.fetchall()
    box = []
    for row in results:
        box.append("https://sh.lianjia.com/ershoufang/c" + row[0])
    start_urls = box

    def parse(self, response):
        for quote in response.xpath('//li [@class="clear LOGCLICKDATA"]'):
            title = getList(
                quote.xpath(
                    './/div[@class="info clear"]/div[@class="title"]/a/text()')
            )
            link = getList(
                quote.xpath('.//a[@class="noresultRecommend img "]/@href'))
            house_code = getList(
                quote.xpath(
                    './/a[@class="noresultRecommend img "]/@data-housecode'))
            community_code = getList(
                quote.xpath(
                    './/div[@class="info clear"]/div[@class="priceInfo"]/div[@class="unitPrice"]/@data-rid'
                ))

            community_link = getList(
                quote.xpath(
                    './/div[@class="info clear"]/div[@class="address"]/div[@class="houseInfo"]/a/@href'
                ))
            community = getList(
                quote.xpath(
                    './/div[@class="info clear"]/div[@class="address"]/div[@class="houseInfo"]/a/text()'
                ))

            town_link = getList(
                quote.xpath(
                    './/div[@class="info clear"]/div[@class="flood"]/div[@class="positionInfo"]/a/@href'
                ))
            town_disc = getList(
                quote.xpath(
                    './/div[@class="info clear"]/div[@class="flood"]/div[@class="positionInfo"]/a/text()'
                ))

            nn = quote.xpath(
                './/div[@class="info clear"]/div[@class="address"]/div[@class="houseInfo"]'
            )
            house_info = nn[0].xpath('string(.)')[0].extract().replace(
                "\n", "").replace("  ", "")
chan_quan = quote.xpath('.//div[@class="info clear"]/div[@class="flood"]/div[@class="positionInfo"]')[0] \
                            .xpath('string(.)')[0].extract().replace("\n", "").replace("  ", "")

            follow_info = quote.xpath('.//div[@class="info clear"]/div[@class="followInfo"]')[0].xpath(
                'string(.)')[0].extract() \
                .replace("\n", "").replace("  ", "")
            tags = quote.xpath('.//div[@class="info clear"]/div[@class="tag"]')[0].xpath('string(.)')[0].extract() \
                .replace("\n", "").replace("  ", "")
            price = int(
                quote.xpath(
                    './/div[@class="info clear"]/div[@class="priceInfo"]/div[@class="totalPrice"]/span/text()'
                )[0].extract())
            price_content = quote.xpath('.//div[@class="info clear"]/div[@class="priceInfo"]/div[@class="totalPrice"]')[
                0] \
                .xpath('string(.)')[0].extract().replace("\n", "").replace("  ", "")
            unit_price = getList(
                quote.xpath(
                    './/div[@class="info clear"]/div[@class="priceInfo"]/div[@class="unitPrice"]/span/text()'
                ))

            yield {
                'title': title,
                'link': link,
                'house_code': house_code,
                'community_code': community_code,
                'community_link': community_link,
                'community': community,
                'town_link': town_link,
                'town_disc': town_disc,
                'house_info': house_info,
                'chan_quan': chan_quan,
                'follow_info': follow_info,
                'tags': tags,
                'price': price,
                'price_content': price_content,
                'unit_price': unit_price
            }
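The getList helper these spiders call is not included in this listing. Judging from how its result is concatenated with baseUrl and sliced as a string, it presumably flattens an XPath SelectorList into a single string; a hypothetical sketch:

def getList(selector_list):
    # Assumption: returns the first matched node as a plain string,
    # or an empty string when nothing matched.
    return selector_list.extract_first() or ""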
Code Example #7
# coding=utf-8
from msysqlDb import mysqlHadler

dbObject = mysqlHadler().dbHandle()
cursor = dbObject.cursor()
cursor.execute("USE test")
# sql = "INSERT INTO articles(author,title,times,url,admire,likes) VALUES(%s,%s,%s,%s,%s,%s)"
sql = "select link from rent_detail_lianjia ;"
try:
    cursor.execute(sql)
    results = cursor.fetchall()
    # print results
    for row in results:
        link = row[0]

# print the result
        print("fname=%s" % link)
except BaseException as e:
    print("ERROR>>>>>>>>>>>>>", e, "<<<<<<<<<<<<<ERROR")