Example #1
    def parse_detail(self, response):
        self.log('this is an item page! %s' % response.url)
        item = self.get_new_item(response)
        item['json']['date'] = response.meta['date']
        item['json']['brand_name'] = response.meta['brand_name']
        item['json']['product_description'] = response.meta[
            'product_description']
        item['json']['reason_problem'] = response.meta['reason_problem']
        item['json']['company'] = response.meta['company']
        sourceurl = response.meta["source_url"]

        article = response.xpath("//article").extract()[0].strip()
        item['json']['title'] = response.xpath(
            "//title/text()").extract()[0].strip()
        lxmlTree = fromstring(article)
        if lxmlTree.xpath("//article/h3[1]"):
            dangerTitle = lxmlTree.xpath("//article/h3[1]")[0]
            dangerTitle.getparent().remove(dangerTitle)
        if lxmlTree.xpath("//article/p[1]"):
            dangerContent = lxmlTree.xpath("//article/p[1]")[0]
            dangerContent.getparent().remove(dangerContent)
        title = []
        if 'title' not in item and lxmlTree.xpath("//article/h1"):
            h1_tmp = tostring(lxmlTree.xpath("//article/h1")[0],
                              encoding="UTF-8")
            h1_start = h1_tmp.find('>') + 1
            h1_end = h1_tmp.find('</')
            item['json']['title'] = h1_tmp[h1_start:h1_end]
            title = lxmlTree.xpath("//h1")[0]
            title.getparent().remove(title)
        elif 'title' not in item and lxmlTree.xpath("//article/h2"):
            h2_tmp = tostring(lxmlTree.xpath("//article/h2")[0],
                              encoding="UTF-8")
            h2_start = h2_tmp.find('>') + 1
            h2_end = h2_tmp.find('</')
            item['json']['title'] = h2_tmp[h2_start:h2_end]
            title = lxmlTree.xpath("//article/h2")[0]
            title.getparent().remove(title)
        elif 'title' not in item and lxmlTree.xpath("//article/h3"):
            h3_tmp = tostring(lxmlTree.xpath("//article/h3")[0],
                              encoding="UTF-8")
            h3_start = h3_tmp.find('>') + 1
            h3_end = h3_tmp.find('</')
            item['json']['title'] = h3_tmp[h3_start:h3_end]
            title = lxmlTree.xpath("//article/h3")[0]
            title.getparent().remove(title)
        item_url = ''
        for element in lxmlTree.xpath("//article/p"):
            text = ''
            if element.xpath(".//text()"):
                text = element.xpath(".//text()")[0].strip()
            if 'Photo:' in text:
                item_url = urlparse.urljoin(
                    response.url,
                    element.xpath(".//a/@href")[0].strip())
            if not text \
                or text == '###' \
                or 'RSS Feed for FDA Recalls Information' in text \
                or 'Photo: Product Labels' in text \
                    or 'Recalled Product Photos Are Also Available on FDA' in text:
                element.getparent().remove(element)

        contentProcess = tostring(lxmlTree, encoding="UTF-8")
        spSelector = Selector(text=contentProcess)
        item['json']['content'] = spSelector.xpath("//article").extract()[0]
        item['html'] = response.body
        htmls_path = {sourceurl: response.body}
        item["htmls_path"] = htmls_path
        item['source_url'] = sourceurl
        if item_url:
            yield scrapy.Request(item_url,
                                 callback=self.parse_item,
                                 dont_filter=True,
                                 meta={
                                     'item': item,
                                     "item_url": item_url
                                 })
        else:
            item['image_urls'] = []
            yield item
Example #2
sql = 'select url from baidu_xinshao'
cursor.execute(sql)
data = cursor.fetchall()
print(len(data))
for i in range(len(data)):
    try:
        print('{0} crawling page {1} {0}'.format('-' * 20, i))
        x = requests.get(
            data[i][0],
            headers={
                'User-Agent':
                'Mozilla/5.0 (X11; Linux x86_64) '
                'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
            })
        x.encoding = 'utf-8'
        page = Selector(text=x.text)
        title1 = page.xpath('/html/body')

        title = (re.sub("[^\u4e00-\u9fa5]", '',
                        title1.xpath('string(.)').extract_first()))
        # print(title)
        # string formatting like this is open to SQL injection; a parameterized
        # query would be safer if the driver supports it
        cursor.execute("insert into data(url,page) VALUES('{0}','{1}')".format(
            data[i][0], title))
        conn.commit()
    except Exception as e:
        print(e)
# print(data)
cursor.close()
Example #3
    def parse_content(self, response):
        item = ChallengsItem()
        sel2 = Selector(response)

        try:

            imgurls = sel2.xpath(r'//div[@class="in"]/img/@data-src').extract()
            if not imgurls:
                image_urls_a = ['']
                list = ['']

            else:
                image_urls_a = imgurls
                listimg = []
                list = []
                for imgurl in imgurls:
                    picname = imgurl.split('/')[-1]
                    list.append(picname)
            item['category'] = ['economie']
            desf = sel2.xpath('//div[@class="article-start"]/p[2]').extract()
            print desf
            if not desf:
                list1 = ['']
            else:
                for dea in desf:
                    tj = '<a href="'
                    if tj in dea:
                        dess1 = re.sub('<a.*?">', '', dea)
                        des = re.sub('</a>', '', dess1)
                    else:
                        des = dea
                a1 = ''.join(des)
                list1 = []
                list1.append(a1)
            now = datetime.datetime.now()
            bb = now.strftime('%Y-%m-%d')
            cc = str(bb)
            listd = []
            listd.append(cc)
            item['datime'] = listd
            title = sel2.xpath('//h1[@itemprop="headline"]/text()').extract()
            tita = ''
            for tit in title:
                tita = tit
            item['title'] = title
            #item_title = ''.join(title)
            data = sel2.xpath('//div[@itemprop="articlebody"]').extract()

            if '' in list1:
                item['des'] = ['']
                #item_des = ['']

            else:
                item['des'] = list1
                #item_des = list1

            if '' in list:
                item['img'] = ['']
                #item_img = ['']

            else:
                imgl = []
                for imga in list:
                    imgadress = 'http://www.actualites-les.com/static/images/ch/' + imga
                    img = '<img src="' + imgadress + '" alt="' + tita + '">'
                    a = ''.join(img)
                    imgl.append(a)
                item['img'] = imgl
                #item_img = imgl
            ilist = []
            for da in data:
                body = da
                tj = '<a href="'
                tj2 = '</iframe>'
                tj3 = 'data-uri='
                tj4 = '<div class="right">'
                tj5 = '<li class="item"'
                tj6 = '<script async src='
                tj7 = '<img alt='
                tj8 = '</ul></div></div></div>'
                datas1 = re.sub('<a.*?">', '', da)
                datas2 = re.sub('</a>', '', datas1)
                if tj in body:
                    data1 = datas2
                else:
                    data1 = body
                if tj2 in data1:
                    data2 = re.sub(r'<iframe.*?</iframe>', '', data1)
                else:
                    data2 = data1

                if tj3 in data2:
                    tj3a = re.sub(
                        '<div class="article-diaporama diapo-micro".*?</div>',
                        '', data2)
                    tj8a = re.sub(
                        '<div class="article-diaporama diapo-micro".*?</ul></div></div></div>',
                        '', data2)
                    if not tj8a:
                        data3 = tj3a
                    else:
                        data3 = tj8a
                else:
                    data3 = data2
                if tj4 in data3:
                    data4 = re.sub('<div class="right">.*?</div>', '', data3)
                else:
                    data4 = data3
                if tj5 in data4:
                    data5 = re.sub('<ul><li class="item".*?</div>', '', data4)
                else:
                    data5 = data4
                if tj6 in data5:
                    data6 = re.sub('<script.*?</script>', '', data5)
                else:
                    data6 = data5
                if tj7 in data6:
                    imgurl = re.findall('<img alt=.*?src="(.*?)".*?>', data6)
                    image_urls2 = imgurl
                    #item['image_urls'] = ['']
                    if not image_urls2:
                        ilist = ['']
                    else:
                        ilist = image_urls2

                    list2img = []
                    for img2 in imgurl:
                        picname2 = img2.split('/')[-1]
                        listar = []
                        picurl2 = 'http://www.actualites-les.com/static/images/ch' + picname2
                        namepic = '<img alt="' + tita + '"' + 'src="' + picurl2 + '">'
                        data7 = re.sub('<img alt=.*?">', namepic, data6)
                        article = ''.join(data7)
                        listar.append(article)
                        item['article'] = listar
                        # print '------ with condition tj7 matched ------', item['article']
                else:
                    data7 = data6
                    listar = []
                    article = ''.join(data7)
                    listar.append(article)
                    item['article'] = listar
                    # print 'without condition tj7', item['article']
            item['image_urls'] = image_urls_a + ilist
            yield item

        except Exception as e:
            print 'content parse error:', e
Example #4
def data_shuffle(data):
    re_data = dict()
    re_data["ENTITY_CODE_"] = data["ENTITY_CODE_"]
    re_data["ENTITY_NAME_"] = data["ENTITY_NAME_"]
    re_data["URL_"] = data["URL_"]
    # annual fee
    re_data["FEE_"] = data["FEE_"]
    # cash withdrawal limit
    re_data["CASHING_AMOUNT_"] = data["CASHING_AMOUNT_"]
    # credit limit (maximum)
    re_data["MOST_AMOUNT_"] = data["MOST_AMOUNT_"]
    # card level
    re_data["CARD_LEVEL_"] = data["CARD_LEVEL_"]
    # card network
    re_data["CARD_ORG_"] = data["CARD_ORG_"]
    # card image
    if "IMG_" in data:
        image_url = data["IMG_"]
        response = req_for_something(url=image_url)
        if response:
            t = base64.b64encode(response.content)
            re_data["IMG_"] = t.decode("utf-8")
    # card name
    re_data["CARD_NAME_"] = data["CARD_NAME_"]
    # benefits (text description)
    re_data["POWER_WRITING_"] = data["POWER_WRITING_"]
    # card attributes
    re_data["CARD_ATTR_"] = data["CARD_ATTR_"]
    # credit limit
    re_data["CREDIT_AMOUNT_"] = data["CREDIT_AMOUNT_"]
    # interest-free period
    re_data["INTEREST_FREE_"] = data["INTEREST_FREE_"]
    # detailed introduction
    INTRO_ = BeautifulSoup(data["INTRO_"], "html.parser").getText()
    pattern = re.compile(r"[\s\S]*卡片介绍([\s\S]*)")
    if re.match(pattern, INTRO_):
        a = re.match(pattern, INTRO_)
        intro = a.group(1)
        intro = re.sub('[\n]+', '', intro)
        re_data["INTRO_"] = intro

    # card introduction
    # print(data["CARD_INTRO_"])
    soup = BeautifulSoup(data["CARD_INTRO_"], "html.parser")
    re_data["CARD_INTRO_"] = soup.find('div', {"class": "adp"}).text
    # pattern = re.compile(r"[\s\S]*内容页\*/[\s\S]*\.link-hover{color:#0066cc; border-bottom:1px dashed #ccc;}([\s\S]*)")

    # OTHER_REPAY_: other repayment channels
    soup = BeautifulSoup(data["OTHER_REPAY_"], "html.parser")
    # print(soup)
    a = soup.find_all('div', {"class": "tt2_1"})
    OTHER_REPAY_LIST = list()
    for item in a:
        OTHER_REPAY_LIST.append(item.string)
    OTHER_REPAY_ = "|".join(OTHER_REPAY_LIST)
    re_data["OTHER_REPAY_"] = OTHER_REPAY_

    # OFFLINE_REPAY_: repayment at branch outlets
    soup = BeautifulSoup(data["OFFLINE_REPAY_"], "html.parser")
    a = soup.find_all('div', {"class": "tt2_1"})
    OFFLINE_REPAY_LIST = list()
    for item in a:
        OFFLINE_REPAY_LIST.append(item.string)
    OFFLINE_REPAY_ = "|".join(OFFLINE_REPAY_LIST)
    re_data["OFFLINE_REPAY_"] = OFFLINE_REPAY_

    # NET_REPAY_: online repayment
    soup = BeautifulSoup(data["NET_REPAY_"], "html.parser")
    a = soup.find_all('div', {"class": "tt2_1"})
    NET_REPAY_LIST = list()
    for item in a:
        NET_REPAY_LIST.append(item.string)
    NET_REPAY_ = "|".join(NET_REPAY_LIST)
    re_data["NET_REPAY_"] = NET_REPAY_

    # ACTIVATE_: activation
    re_data["ACTIVATE_"] = data["ACTIVATE_"]

    # SCORE_MILEAGE_: points-to-mileage conversion
    SCORE_MILEAGE_ = BeautifulSoup(data["SCORE_MILEAGE_"],
                                   "html.parser").getText()
    pattern = re.compile(
        r"[\s\S]*内容页\*/[\s\S]*\.link-hover{color:#0066cc; border-bottom:1px dashed #ccc;}([\s\S]*)"
    )
    if re.match(pattern, SCORE_MILEAGE_):
        a = re.match(pattern, SCORE_MILEAGE_)
        score_mileage = a.group(1)
        score_mileage = re.sub('[\n]+', '', score_mileage)
        score_mileage = re.sub('\s+', '', score_mileage)
        re_data["SCORE_MILEAGE_"] = score_mileage

    # SCORE_METHOD_: points redemption method
    SCORE_METHOD_ = BeautifulSoup(data["SCORE_METHOD_"],
                                  "html.parser").getText()
    pattern = re.compile(
        r"[\s\S]*内容页\*/[\s\S]*\.link-hover{color:#0066cc; border-bottom:1px dashed #ccc;}([\s\S]*)"
    )
    if re.match(pattern, SCORE_METHOD_):
        a = re.match(pattern, SCORE_METHOD_)
        score_method = a.group(1)
        score_method = re.sub('[\n]+', '', score_method)
        score_method = re.sub('\s+', '', score_method)
        re_data["SCORE_METHOD_"] = score_method

    # SCORE_SEARCH_: how to check points
    SCORE_SEARCH_ = BeautifulSoup(data["SCORE_SEARCH_"],
                                  "html.parser").getText()
    pattern = re.compile(
        r"[\s\S]*内容页\*/[\s\S]*\.link-hover{color:#0066cc; border-bottom:1px dashed #ccc;}([\s\S]*)"
    )
    if re.match(pattern, SCORE_SEARCH_):
        a = re.match(pattern, SCORE_SEARCH_)
        score_search = a.group(1)
        score_search = re.sub('[\n]+', '', score_search)
        score_search = re.sub('\s+', '', score_search)
        re_data["SCORE_SEARCH_"] = score_search

    # SCORE_ACCU_: points accumulation rules
    SCORE_ACCU_ = BeautifulSoup(data["SCORE_ACCU_"], "html.parser").getText()
    pattern = re.compile(
        r"[\s\S]*内容页\*/[\s\S]*\.link-hover{color:#0066cc; border-bottom:1px dashed #ccc;}([\s\S]*)"
    )
    if re.match(pattern, SCORE_ACCU_):
        a = re.match(pattern, SCORE_ACCU_)
        score_accu = a.group(1)
        score_accu = re.sub('[\n]+', '', score_accu)
        score_accu = re.sub('\s+', '', score_accu)
        re_data["SCORE_ACCU_"] = score_accu

    # SCORE_VALID_: points validity period
    SCORE_VALID_ = BeautifulSoup(data["SCORE_VALID_"], "html.parser").getText()
    pattern = re.compile(
        r"[\s\S]*内容页\*/[\s\S]*\.link-hover{color:#0066cc; border-bottom:1px dashed #ccc;}([\s\S]*)"
    )
    if re.match(pattern, SCORE_VALID_):
        a = re.match(pattern, SCORE_VALID_)
        score_valid = a.group(1)
        score_valid = re.sub('[\n]+', '', score_valid)
        score_valid = re.sub('\s+', '', score_valid)
        re_data["SCORE_VALID_"] = score_valid

    # PREPAYMENT_: early repayment rules
    PREPAYMENT_ = BeautifulSoup(data["PREPAYMENT_"], "html.parser").getText()
    pattern = re.compile(
        r"[\s\S]*内容页\*/[\s\S]*\.link-hover{color:#0066cc; border-bottom:1px dashed #ccc;}([\s\S]*)"
    )
    if re.match(pattern, PREPAYMENT_):
        a = re.match(pattern, PREPAYMENT_)
        repayment = a.group(1)
        repayment = re.sub('[\n]+', '', repayment)
        repayment = re.sub('\s+', '', repayment)
        re_data["PREPAYMENT_"] = repayment

    # CHARE_DEDUCT_: fee deduction method
    CHARE_DEDUCT_ = BeautifulSoup(data["CHARE_DEDUCT_"],
                                  "html.parser").getText()
    pattern = re.compile(
        r"[\s\S]*内容页\*/[\s\S]*\.link-hover{color:#0066cc; border-bottom:1px dashed #ccc;}([\s\S]*)"
    )
    if re.match(pattern, CHARE_DEDUCT_):
        a = re.match(pattern, CHARE_DEDUCT_)
        chage_deduct = a.group(1)
        chage_deduct = re.sub('[\n]+', '', chage_deduct)
        chage_deduct = re.sub('\s+', '', chage_deduct)
        re_data["CHARE_DEDUCT_"] = chage_deduct

    # NUMBER_RATE_: installment periods and rates
    # print(data["NUMBER_RATE_"])
    from scrapy.selector import Selector
    import requests
    response = requests.get(
        data['URL_'],
        headers={
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
        })
    html = Selector(text=response.content.decode('gb2312'))
    trs = html.xpath(
        '//div[@id="fwq1"]//table[@class="MsoNormalTable"]//tr[position()>5 and position()<last()-1]'
    )
    for tr in trs:
        try:
            page = trs.index(tr) + 6
            xpath_ = f'//div[@id="fwq1"]//table[@class="MsoNormalTable"]//tr[{page}]'
            periods_1 = tr.xpath(xpath_ +
                                 '/td[1]/p/span[1]/text()').extract()[0]
            rate_1 = tr.xpath(xpath_ +
                              '//td[1]/p/span[2]/text()').extract()[-1]

            periods_2 = tr.xpath(xpath_ +
                                 '/td[2]/p/span[1]/text()').extract()[0]
            rate_2 = tr.xpath(xpath_ + '/td[2]/p/span[2]/text()').extract()[-1]
        except:
            periods_1, rate_1, periods_2, rate_2 = '', '', '', '',
        print(periods_1, rate_1, periods_2, rate_2)
    return re_data
Example #5
    def parse(self, response):
        logging.info("***********************************")
        # these two vals are for callback args
        #   that we passed in Request(next_page_url)
        write_title = ( response.meta['write_title'] \
                        if ('write_title' in response.meta) else True )
        start_row = ( response.meta['start_row'] \
                        if ('start_row' in response.meta) else 0 )
        crawld_pages = ( response.meta['crawld_pages'] \
                        if ('crawld_pages' in response.meta) else 0 )
        logging.info("write_title: %s" % write_title)
        logging.info("start_row %s" % start_row)
        logging.info("crawld_pages %s" % crawld_pages)

        # using browser to get url again ...
        logging.info("***********************************")
        yield Request(response.url, headers={'User-Agent': ""})
        self.browser.get(response.url)
        logging.info("#### got url ...")
        logging.info("#### response: %s" % response)

        browser_response = Selector(text=self.browser.page_source)
        #browser_response = response
        logging.info("type(browser_response): %s" % type(browser_response))

        # broswer is ready now ...
        logging.info("begin the logic of parse method ... ")

        # crawl data of this page
        # start here
        logging.info("start crawling data of this page ...")

        row_index = start_row

        # for the title of the table
        if write_title:
            for col in self.rule["columns"]:
                if self.rule["columns"][col]["title"] != "None":
                    data = browser_response.xpath(
                        self.rule["columns"][col]["title"]).extract()
                    #print "data: ", data
                    self.worksheet.write(row_index, int(col) - 1, data)
            row_index += 1

        # for the content of the table
        for select in browser_response.xpath(self.rule["table_tag"]):
            #logging.info("select: %s" % select)
            #logging.info("self.rule[\"columns\"]: %s" % self.rule["columns"])
            for col in self.rule["columns"]:
                data = select.xpath(
                    self.rule["columns"][col]["content"]).extract()
                #logging.info("data: %s" % data)
                self.worksheet.write(row_index, int(col) - 1, data)
            row_index += 1

        crawld_pages += 1
        logging.info("finished crawling data of this page ...")

        # next_page need to be crawl ...
        # then do it

        # For USNEWS only:
        #   show start and end page number in url to replace NUM
        #   IN CONFIG:
        #       "next_page": [
        #           "URL_PATTERN",
        #           ["url_pattern", "start_page", "end_page"]
        #       ],
        if self.next_page[0] == "URL_PATTERN":
            if self.next_page[1] and (crawld_pages < int(
                    self.next_page[1][2])):
                # replace NUM in url_pattern with page_number in range
                # AND assume that NUM in url_pattern here
                next_url = self.next_page[1][0].replace(\
                                                "NUM", str(crawld_pages+1))
                request = scrapy.Request(
                    next_url,
                    callback=self.parse,
                    headers={
                        'User-Agent':
                        "Windows;U;Windows NT 6.1;en-US;rv;1.9.1.6"
                    })
                request.meta['write_title'] = False
                request.meta['start_row'] = row_index
                request.meta['crawld_pages'] = crawld_pages
                yield request
            else:
                logging.info("No more next_page to crawl ...")
                logging.info("I will quit my parse here ... Thanks ...")

        logging.info("Finish the logic of parse method ... ")
Example #6
    def parse_with_rules(self, response, rules, item_class):
        return self.dfs(Selector(response), rules, item_class)
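The dfs helper called in this snippet is not shown. As a rough, hypothetical sketch of the same idea (walking a rule mapping with a Selector and collecting extracted values), assuming a simple dict-based rule format that is not taken from the original spider:

from scrapy.selector import Selector

def walk_rules(selector, rules):
    # Hypothetical rule walker: a rule is either an XPath string (leaf)
    # or a (scope_xpath, nested_rules) pair applied to every matched node.
    item = {}
    for field, rule in rules.items():
        if isinstance(rule, str):
            item[field] = selector.xpath(rule).extract()
        else:
            scope_xpath, sub_rules = rule
            item[field] = [walk_rules(node, sub_rules)
                           for node in selector.xpath(scope_xpath)]
    return item

sel = Selector(text='<div><h1>Title</h1><ul><li>a</li><li>b</li></ul></div>')
print(walk_rules(sel, {'title': '//h1/text()',
                       'entries': ('//li', {'text': './text()'})}))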
Example #7
    def parse_product_list(self, response):

        metadata = response.meta['userdata']
        sel = Selector(response)

        product_nodes = sel.xpath(
            '//div[@id="resultsCol"]//div[contains(@id, "result_")]')
        for node in product_nodes:

            see_all_node = node.xpath('.//li[@class="seeAll"]/a[@href]')
            if see_all_node:
                try:
                    see_all_href = see_all_node.xpath('./@href').extract()[0]
                    see_all_href = self.process_href(see_all_href,
                                                     response.url)
                    if see_all_href:
                        ms = copy.deepcopy(metadata)

                        yield Request(url=see_all_href,
                                      callback=self.parse_product_list,
                                      errback=self.onerr,
                                      meta={'userdata': ms})
                except (TypeError, IndexError):
                    pass

            try:
                href = node.xpath('.//a[@href]/@href').extract()[0]
                href = self.process_href(href, response.url)
            except (TypeError, IndexError):
                continue

            m = copy.deepcopy(metadata)

            category_node = node.xpath('.//span[@class="bold orng"][text()]')
            if category_node:
                try:
                    category_text = category_node.xpath(
                        './text()').extract()[0]
                    category_text = self.reformat(category_text)
                    # drop the trailing ':' and anything after it
                    mt = re.search(ur'([^:]+)', category_text)
                    if mt:
                        category_text = mt.group(1)
                    category_name = category_text.lower()

                    if category_name and category_text:
                        m['tags_mapping']['category-0'] = [{
                            'name': category_name,
                            'title': category_text,
                        }]
                except (TypeError, IndexError):
                    pass

            yield Request(url=href,
                          callback=self.parse_product,
                          errback=self.onerr,
                          meta={'userdata': m})

        next_node = sel.xpath(
            '//div[@id="centerBelowMinus"]//div[@id="pagn"]//a[@id="pagnNextLink"][@href]'
        )
        if next_node:
            try:
                next_href = next_node.xpath('./@href').extract()[0]
                next_href = self.process_href(next_href, response.url)

                yield Request(url=next_href,
                              callback=self.parse_product_list,
                              errback=self.onerr,
                              meta={'userdata': metadata})
            except (TypeError, IndexError):
                pass
Example #8
    def parse(self, response):
        body = Selector(text=response.body)
        item_url = response.request.url
        self.save(body, item_url)
Example #9
    def parse_item(self, response):
        sel = Selector(response)
        product_links = sel.xpath(self.configs['product_links'])
        for pl in product_links:
            url = response.urljoin(pl.extract())
            yield scrapy.Request(url, callback=self.parse_detail_content)
Example #10
    def setUp(self):
        self.selector = Selector(text=post_item, type="html")
Example #11
    def setUp(self):
        self.selector = Selector(text=article, type="html")
Example #12
    def setUp(self):
        self.selector = Selector(text=header, type="html")
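The fixtures post_item, article and header used in the three setUp snippets above are not included. A self-contained sketch of the same test pattern, with a made-up HTML fixture standing in for the real ones, could look like this:

import unittest
from scrapy.selector import Selector

# stand-in for the missing post_item / article / header fixtures
post_item = '<div class="post"><h2>Hello</h2><a class="tag">scrapy</a></div>'

class PostItemSelectorTest(unittest.TestCase):
    def setUp(self):
        self.selector = Selector(text=post_item, type="html")

    def test_title(self):
        self.assertEqual(self.selector.xpath('//h2/text()').extract_first(),
                         'Hello')

    def test_tag(self):
        self.assertEqual(self.selector.css('a.tag::text').extract(),
                         ['scrapy'])

if __name__ == '__main__':
    unittest.main()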
Example #13
    def parse(self, response):
        sel = Selector(response)
        elements = sel.xpath('//a[contains(@class, "tag")]/text()')
        page = Page()
        page['tags'] = [element.extract() for element in elements]
        return page
Example #14
from scrapy.selector import Selector

with open('tt.xml', 'r') as f:
    body = f.read()

# print(body)
title1 = Selector(text=body).xpath('//ul[@class="txt txt2"]/li').extract()
# print(title1)
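To sanity-check the XPath without the tt.xml file, the same expression can be run against a small inline snippet (the markup below is made up):

from scrapy.selector import Selector

sample = '<ul class="txt txt2"><li>cloudy</li><li>sunny</li></ul>'
print(Selector(text=sample).xpath('//ul[@class="txt txt2"]/li/text()').extract())
# ['cloudy', 'sunny']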
Example #15
def img_url_from_page(url):
    html = requests.get(url).text
    sel = Selector(text=html)
    img_names = sel.css('td a img::attr(src)').extract()
    return img_names
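Usage is direct, reusing img_url_from_page and its imports from the snippet above; the gallery URL below is only a placeholder, and since the scraped src values may be relative, joining them against the page URL is usually needed:

from urllib.parse import urljoin

page_url = 'https://example.com/gallery/'  # placeholder, not from the original code
for name in img_url_from_page(page_url):
    print(urljoin(page_url, name))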
Example #16
    def parse_thread(self, response):
        logging.info(
            '############################# Start #####################################'
        )
        threadModel = ThreadItem()
        logging.info('-------------thread---------------')
        pattern = re.compile(r'\d+')
        thread = pattern.findall(response.url)[1]
        logging.info(thread)
        threadModel["threadId"] = thread
        logging.info(response.body)
        hxs = Selector(response)
        titles = hxs.xpath('//h1/a/text()').extract()
        logging.info('-------------titles---------------')
        for title in titles:
            logging.info(title)
        threadModel["titles"] = titles
        breadCrumbs = hxs.xpath('//*[@id="pt"]/div/a/text()').extract()
        breadCrumbs = breadCrumbs[2:len(breadCrumbs) - 1]
        threadModel["breadCrumbs"] = breadCrumbs
        locks = hxs.xpath('//div[@class="locked"]/text()').extract()
        if len(locks) > 0:
            logging.info('-------------locks---------------')
            for lock in locks:
                logging.info(lock)
        #files = hxs.xpath('//span[re:test(@id, "attach_\d*")]/a/@href').extract()
        showhides = hxs.xpath('//div[@class="showhide"]')
        files = []
        files = hxs.xpath('//div[@class="showhide"]//a/@href').extract()
        #for i, f in enumerate(files):
        #    if f == 'javascript:;':
        #        del files[i]

        hideTexts = showhides.xpath('text()').extract()

        for i in range(len(files) - 1, -1, -1):
            if files[i] == 'javascript:;':
                files.pop(i)

        if len(files) > 0:
            pass
        elif len(showhides.extract()) > 0 and len(hideTexts) > 0:
            hideText = hideTexts[0].strip()
            if hideText.startswith('thunder') or hideText.startswith(
                    'ed2k') or hideText.startswith('magnet'):
                files.append(hideText)
        else:
            files = hxs.xpath(
                '//span[re:test(@id, "attach_\d*")]/a/@href').extract()

            if len(files) == 0:
                allLinkTag = hxs.xpath('//a')
                for linkTag in allLinkTag:
                    allText = linkTag.xpath('text()').extract()
                    if len(allText) > 0:
                        linkText = linkTag.xpath('text()').extract()[0]
                        if len(linkText) > 0:
                            linkText = linkText.strip()
                            if linkText.endswith('torrent'):
                                files.append(
                                    linkTag.xpath('@href').extract()[0])

        startStrings = ('thunder', 'ed2k', 'magnet',
                        'https://pan.baidu.com', 'http://pan.baidu.com',
                        'http://www.87lou.com/forum.php?mod=attachment',
                        'http://duwude.ctfile.com')

        allHref = hxs.xpath('//a/@href').extract()

        for link in allHref:
            if link.lower().startswith(startStrings):
                files.append(link)

        #func = lambda x,y:x if y in x else x + [y]

        #reduce(func, [[], ] + files)

        files = list(set(files))

        passwords = []
        if len(showhides.extract()) > 0:
            hideTexts = showhides.xpath('text()').extract()
            if len(hideTexts) > 0:
                for ht in hideTexts:
                    if u'\u5bc6\u7801' in ht:  # u'\u5bc6\u7801' is '密码' (password)
                        passwords.append(ht.strip())

        if len(passwords) > 0:
            threadModel["passwords"] = passwords

        if len(files) > 0:
            logging.info('-------------breadCrumbs---------------')
            for breadCrumb in breadCrumbs:
                logging.info(breadCrumb)
            logging.info('-------------files---------------')
            threadModel["files"] = files
            for file in files:
                if file.strip().startswith(
                        'http://www.87lou.com/forum.php?mod=attachment'):
                    yield Request(file,
                                  meta={'thread': thread},
                                  callback=self.download)
                    pass
                logging.info(file)

        logging.info(
            '############################# End #####################################'
        )
        yield threadModel
Example #17
    def parse_page(self, response):
        course = Selector(response)
        item = CourseraItem()
        item['title'] = course.xpath('//pre/a/text()').extract()
        item['mp4_url'] = course.xpath('//pre/text()[2]').extract()
        return item
Example #18
    def parse(self, response):

        book = Selector(response)

        Info = book.xpath('//div[@class="info"]')

        for info in Info:
            item = XiaogongjuItem()

            item['title'] = ''.join(info.xpath('./h2/a/text()').extract())

            # price, publication date, author, publisher, etc.; e.g.: [日] 东野圭吾 / 陈文娟 / 北京十月文艺出版社 / 2018-1 / 45
            pub = info.xpath(
                './div[@class="pub"]/text()').extract_first().strip()

            # use the number of '/'-separated fields in pub to extract price, publication date, publisher, author and translator more precisely
            if len(pub.split('/')) == 6:
                item['price'] = pub.strip().split('/')[-1]  # price
                item['pub_time'] = str(pub.split('/')[-4:-1]).replace(',','-').replace('[','').\
                    replace(']','').replace("'",'').replace(' ','')  # publication date; handles dates split like 1999/9/9
                item['publish'] = pub.strip().split('/')[-5]  # publisher
                item['author'] = str(pub.strip().split('/')[0:2]).replace(',',' | 译者:').\
                    replace('[','').replace(']','').replace("'",'').replace(' ','')  # author / translator

            elif len(pub.split('/')) == 5:
                item['price'] = pub.strip().split('/')[-1]
                item['pub_time'] = pub.strip().split('/')[-2]
                item['publish'] = pub.strip().split('/')[-3]
                item['author'] = str(pub.strip().split('/')[0:2]).replace(',',' | 译者:').\
                    replace('[','').replace(']','').replace("'",'').replace(' ','')

            elif len(pub.split('/')) == 3:
                item['price'] = pub.strip().split('/')[-1]
                item['pub_time'] = pub.strip().split('/')[-2]
                item['publish'] = None
                item['author'] = str(pub.strip().split('/')[0])

            else:
                item['price'] = pub.strip().split('/')[-1]
                item['pub_time'] = pub.strip().split('/')[-2]
                item['publish'] = pub.strip().split('/')[-3]
                item['author'] = pub.strip().split('/')[0]

            item['star'] = ''.join(
                info.xpath(
                    './div[@class="star clearfix"]/span/text()').extract())

            yield scrapy.Request(
                url=info.xpath('./h2[@class=""]/a/@href').extract_first(),
                callback=self.parse_book_info,
                meta={'item': item})

        try:
            nextPage = book.xpath(
                '//div[@id="subject_list"]/div[@class="paginator"]/'
                'span[@class="next"]/a/@href').extract_first().strip()
            if nextPage:
                next_url = "https://book.douban.com%s" % nextPage
                yield scrapy.http.Request(next_url, callback=self.parse)

        except:
            pass
Example #19
    def parse(self, response):
        tables = response.xpath('//table[@bgcolor="#808000"]')

        for table in tables:
            list_tr = table.xpath('tr').extract()
            tr = Selector(text=list_tr[0])

            item = DseMarketSummaryItem()

            item['date'] = tr.xpath('//font/text()').extract_first(
            )[len("Market Summary of "):].replace(",", "")
            item['date'] = datetime.strptime(item['date'], '%b %d %Y')

            tr = Selector(text=list_tr[1])

            list_td = tr.xpath('//td').extract()
            td = Selector(text=list_td[1])
            item['dsex_index'] = float(
                td.xpath('//font/text()').extract_first().replace(",", ""))

            td = Selector(text=list_td[3])
            item['total_trade'] = int(
                td.xpath('//font/text()').extract_first().replace(",", ""))

            tr = Selector(text=list_tr[2])

            list_td = tr.xpath('//td').extract()
            td = Selector(text=list_td[1])
            item['dsex_index_change'] = float(
                td.xpath('//font/text()').extract_first().replace(",", ""))
            td = Selector(text=list_td[3])
            item['total_value_taka_mn'] = float(
                td.xpath('//font/text()').extract_first().replace(",", ""))

            tr = Selector(text=list_tr[3])

            list_td = tr.xpath('//td').extract()
            td = Selector(text=list_td[1])
            item['ds30_index'] = float(
                td.xpath('//font/text()').extract_first().replace(",", ""))
            td = Selector(text=list_td[3])
            item['total_volume'] = int(
                td.xpath('//font/text()').extract_first().replace(",", ""))

            tr = Selector(text=list_tr[4])

            list_td = tr.xpath('//td').extract()
            td = Selector(text=list_td[1])
            item['ds30_index_change'] = float(
                td.xpath('//font/text()').extract_first().replace(",", ""))
            td = Selector(text=list_td[3])
            item['total_market_cap_taka_mn'] = float(
                td.xpath('//text()').extract_first().replace(",", ""))

            yield item
Example #20
def get_content(url):

    html = get_html(url)

    dates = Selector(text=html).xpath('//div/ul[@class="week"]/li').extract()
    weas = Selector(text=html).xpath('//ul[@class="txt txt2"]/li').extract()
    tems = Selector(text=html).xpath('//div[@class="zxt_shuju"]/ul/li').extract()
    wind = Selector(text=html).xpath('//ul[@class="txt"]/li').extract()

    items = []
    for i in range(7):
        item = {}
        item['date'] = Selector(text=dates[i]).xpath('//b/text()').extract()[0]
        item['week'] = Selector(text=dates[i]).xpath('//span/text()').extract()[0]
        item['img'] = Selector(text=dates[i]).xpath('//img/@src').extract()[0]
        item['weather'] = Selector(text=weas[i]).xpath('//text()').extract()[0]
        temlow = Selector(text=tems[i]).xpath('//b/text()').extract()[0]
        temtop = Selector(text=tems[i]).xpath('//span/text()').extract()[0]
        item['temperature'] = temlow+'~~ '+temtop
        item['wind'] = Selector(text=wind[i]).xpath('//text()').extract()[0]
        items.append(item)
    print(items)
Example #21
    def parse_file_page(self, response):
        #item passed from request
        item = response.meta['item']

        #selector
        sel = Selector(response)

        title = str(
            sel.xpath('//*[@class="full-address"]/text()').extract()[0])
        values = ['NA'] * len(self.__hdr)

        item['State'] = str(title.split(',')[-1]).strip().split()[0].upper()
        try:
            if item['State'].upper() in [
                    'VIC', 'NSW', 'QLD', 'WA', 'SA', 'NT'
            ]:
                item['State'] = item['State']
            else:
                item['State'] = list(
                    set(['VIC', 'NSW', 'QLD', 'WA', 'SA',
                         'NT']).intersection(title.split()))[0]
        except:
            item['State'] = 'NA'
        item['Postcode'] = str(title.split(',')[-1]).strip().split()[1]
        item['Suburb'] = str(title.split(',')[-2]).strip()

        listings = sel.xpath('//*[@class="summaryList"]/li/text()').extract()
        try:
            item['Address'] = title
        except:
            item['Address'] = 'NA'

        try:
            response_google = requests.get(
                'https://maps.googleapis.com/maps/api/geocode/json?address=' +
                item['Address'] + '\'')
            resp_json_payload = response_google.json()
            location = (
                resp_json_payload['results'][0]['geometry']['location'])
        except:
            location = {u'lat': 'na', u'lng': 'na'}
        item['dateCrawled'] = str(
            datetime.datetime.now().strftime("%I:%M%p on %B %d, %Y"))
        item['source'] = 'real_commercial'
        item['PageTitle'] = str(
            sel.xpath('//*[@class="full-address"]/text()').extract()[0])
        item['DisplayablePrice'] = str(''.join(
            sel.xpath(
                '//*[@class="price ellipsis"]/text()').extract()).strip())
        item['PropertyDetailsUrl'] = str(response.url)
        item['ConjunctionalAgencyBannerURL'] = str(
            sel.xpath('//*[@class="agencyLogo"]/a/@href').extract()[0])
        item['OccupancyStatus'] = 'NA'
        res = requests.get(item['ConjunctionalAgencyBannerURL'])
        soup = BeautifulSoup(res.text, 'html.parser')
        item['ConjunctionalAgencyContactPhone'] = str(
            soup.findAll(class_='phone hide-number')[0].contents[0])
        try:
            item['ParkingOptions'] = listings[listings.index('Car Spaces') + 1]
        except:
            item['ParkingOptions'] = 'NA'
        item['DisplayableAddressStreet'] = str(title.split(',')[0])

        item['DateFirstListed'] = 'NA'
        item['SaleID'] = str(
            sel.xpath('//*[@class="property_id"]/text()').extract()
            [0]).strip()
        item['EoiRecipientName'] = 'NA'
        item['AgencyBannerUrlCre'] = 'NA'
        item['HasInspections'] = 'NA'
        item['PropertyWebLink'] = str(response.url)
        item['EoiDeliveryAddress'] = 'NA'
        item['CardType'] = 'NA'
        item['Inspections'] = 'NA'
        item['Availability'] = 'NA'

        item['TenderDeliveryAddress'] = 'NA'
        item['Type'] = str(
            sel.xpath('//*[@class="propType last"]/text()').extract()
            [0]).strip()
        item['AgencyAddress'] = str(' '.join(
            sel.xpath('//*[@class="agencyAddr"]/p/text()').extract()))
        item['LeaseListingUrl'] = 'NA'
        item['InspectionTime'] = 'NA'
        item['ConjunctionalAgencyId'] = 'NA'
        item['DisplayableAddressSuburb'] = str(title.split(',')[1]).split()[0]
        item['Categories'] = str(
            sel.xpath('//*[@class="propType last"]/text()').extract()
            [0]).strip()

        item['AuctionDate'] = 'NA'
        item['AgencyLogoURLCRE'] = 'NA'
        item['VideoURL'] = 'NA'
        item['ConjunctionAgency'] = 'NA'
        item['ListingCategory'] = str(
            sel.xpath('//*[@class="propType last"]/text()').extract()
            [0]).strip()
        item['BuildingType'] = str(
            sel.xpath('//*[@class="propType last"]/text()').extract()
            [0]).strip()
        item['TenderEndDateAndTime'] = 'NA'
        item['ConjunctionalAgencyLogoLargeURL'] = 'NA'
        item['LeaseEndDate'] = 'NA'
        item['CaptionType'] = 'NA'
        try:
            item['LastUpdated'] = str(
                parser.parse(listings[listings.index('Last Updated') +
                                      1]).date())
        except:
            item['LastUpdated'] = 'NA'

        item['BrandingBannerUrl'] = 'NA'
        item['AgencyId'] = str(
            sel.xpath('//*[@class="agencyLogo"]/a/@href').extract()[0]).split(
                '-')[-1]
        item['AgencyName'] = str(
            sel.xpath('//*[@class="agencyName ellipsis"]/@title').extract()[0])
        item['PrimaryAgencyColor'] = 'NA'

        item['ListingContacts'] = 'NA'
        item['LogoUrl'] = 'NA'
        item['PriceDisplayText'] = str(''.join(
            sel.xpath('//*[@class="price ellipsis"]/text()').extract()).strip(
            ).encode("ascii", "ignore"))
        item['AdID'] = str(
            sel.xpath('//*[@class="property_id"]/text()').extract()
            [0]).strip()
        item['Images'] = 'NA'
        try:
            item['BuildArea'] = listings[listings.index('Floor Area') + 1]
        except:
            item['BuildArea'] = 'NA'

        try:
            item['BuildAreaDecimal'] = int(
                re.search(r'\d+', item['BuildArea'].replace(',', '')).group())
        except:
            item['BuildAreaDecimal'] = 'NA'

        item['MapLongitude'] = location['lng']

        item['LeaseStartDate'] = 'NA'
        item['DisplayAddressType'] = 'NA'
        item['AgencyLogoLargeURLCRE'] = 'NA'
        item['TenantName'] = 'NA'
        item['UnitDetails'] = 'NA'
        item['DatePlatinumBilling'] = 'NA'
        item['AnnualReturn'] = 'NA'
        item['AuctionAddress'] = 'NA'
        item['VideoInfo'] = 'NA'
        item['LeaseOptions'] = 'NA'
        try:
            item['LandArea'] = listings[listings.index('Land Area') + 1]
        except:
            item['LandArea'] = 'NA'
        try:
            item['LandAreaDecimal'] = int(
                re.search(r'\d+', item['LandArea'].replace(',', '')).group())
        except:
            item['LandAreaDecimal'] = 'NA'

        item['DisplayableAddressTruncated'] = 'NA'
        item['SaleType'] = 'NA'
        item['ConjunctionalAgencyAddress'] = 'NA'
        item['AuctionTerms'] = 'NA'
        item['NabersRating'] = 'NA'
        item['TenantInformation'] = 'NA'
        item['IsArchived'] = 'NA'
        item['TenderRecipientName'] = 'NA'
        item['EoiEndDateAndTime'] = 'NA'
        if item['ParkingOptions'] != 'NA':
            item['Parking'] = 'Y'
        else:
            item['Parking'] = 'N'
        item['PrimaryImageFullSizeUrl'] = 'NA'
        item['Headline'] = str(
            sel.xpath('//*[@id="description"]/h3/text()').extract()[0].encode(
                "ascii", "ignore"))
        item['MapLatitude'] = location['lat']
        item['ConjunctionalAgencyName'] = 'NA'
        item['AdFormat'] = 'NA'
        item['Description'] = self.makeGood(' '.join(
            sel.xpath('//*[@class="body"]/text()').extract()).encode(
                "ascii", "ignore"))
        item['BuildOrLandArea'] = 'NA'
        item['EoiEndDate'] = 'NA'
        item['VirtualTourUrl'] = 'NA'
        item['PdfUploads'] = 1
        item['RentID'] = 'NA'
        item['ConjunctionalAgencyContactName'] = 'NA'
        item['TypeString'] = 'NA'
        item['ResultItemName'] = 'NA'
        item['BreadCrumbItems'] = 'NA'
        item['DateUpdated'] = 'NA'
        item['FirstPropertyTypeName'] = 'NA'
        item['DateEliteBilling'] = 'NA'
        item['TenantRentDetail'] = 'NA'
        item['SaleListingUrl'] = str(response.url)
        item['ConjunctionalAgencyLogoURL'] = 'NA'
        item['DisplayableAddress'] = title
        item['PageID'] = 'NA'
        item['TitanContentType'] = 'NA'
        item['IsSPA'] = 'NA'
        item['TitanAdZone'] = 'NA'
        item['ctype'] = 'NA'
        item['Member'] = 'NA'
        item['SubCategory4'] = 'NA'
        item['SubCategory2'] = 'NA'
        item['SubCategory3'] = 'NA'
        item['SubCategory1'] = 'NA'
        soup = BeautifulSoup(response.body, 'html.parser')
        item['PageName'] = str(soup.title.contents[0])
        item['PageType'] = 'NA'
        item['PageDescription'] = 'NA'
        item['IsTitanEnabled'] = 'NA'
        item['PrimaryCategory'] = 'NA'
        item['AdSlots'] = 'NA'
        item['startDate'] = 'NA'
        item['name'] = 'NA'
        item['url'] = str(response.url)
        item['sameAs'] = 'NA'
        item['context'] = 'NA'
        item['addressLocality'] = 'NA'
        item['addressRegion'] = 'NA'
        item['streetAddress'] = 'NA'
        item['postalCode'] = 'NA'
        item['type'] = 'NA'
        item['description'] = 'NA'
        item['TenantInfoTermOfLeaseFrom'] = 'NA'
        item['TenantInfoTermOfLeaseTo'] = 'NA'
        item['AgencyContacts'] = 'NA'
        item['AgencyID'] = 'NA'
        item['IsYoutube'] = 'NA'
        item['Height'] = 'NA'
        item['Width'] = 'NA'
        item['YoutubeId'] = 'NA'
        item['VideoRequired'] = 'NA'
        item['VideoSrc'] = 'NA'
        item['Autoplay'] = 'NA'
        try:
            item['PriceDecimal'] = int(
                re.search(r'\d+',
                          item['DisplayablePrice'].replace(',', '')).group())
        except:
            item['PriceDecimal'] = 'NA'
        imgs = sel.xpath('//*[@class="thumbImg"]/img/@src').extract()
        count = 0
        for im in imgs:
            item['ImageNumber_' + str(count) + '_url'] = str(im)
            item['ImageNumber_' + str(count) + '_url_transformed'] = str(im)
            count += 1

        for i in range(count, 30):
            item['ImageNumber_' + str(i) + '_url'] = "NA"
            item['ImageNumber_' + str(i) + '_url_transformed'] = "NA"

        agents = sel.xpath('//*[@class="agent clearfix"]').extract()

        count1 = 0
        for a in agents:
            soup = BeautifulSoup(a, 'html.parser')
            item['Agent_' + str(count1) + '_Fax'] = 'NA'
            item['Agent_' + str(count1) + '_MugshotUrl'] = 'NA'
            try:
                item['Agent_' + str(count1) + '_Mobile'] = sel.xpath(
                    '//*[@class="agentPhone"]/a/@data-value').extract()[count1]
            except:
                item['Agent_' + str(count1) + '_Mobile'] = 'NA'
            item['Agent_' + str(count1) + '_Telephone'] = 'NA'
            item['Agent_' + str(count1) + '_Address'] = 'NA'
            try:
                item['Agent_' + str(count1) + '_FullName'] = str(
                    soup.findAll(class_='agentName ellipsis')[0].text)
            except:
                item['Agent_' + str(count1) + '_FullName'] = 'NA'
            count1 += 1

        for i in range(count1, 20):
            item['Agent_' + str(i) + '_Fax'] = 'NA'
            item['Agent_' + str(i) + '_MugshotUrl'] = 'NA'
            item['Agent_' + str(i) + '_Mobile'] = 'NA'
            item['Agent_' + str(i) + '_Telephone'] = 'NA'
            item['Agent_' + str(i) + '_Address'] = 'NA'
            item['Agent_' + str(i) + '_FullName'] = 'NA'

        return item
Example #22
    def parse(self, response):
        url = response.meta['url']
        with open("url.txt", "a") as f:
            f.write(url + "\n")
        sel = Selector(response)
        title = sel.xpath("//h1/text()").extract()
        print(title, url)
        if len(title) != 0:
            title = title[0]
            kotlin_block = sel.xpath(
                "//pre[@class='prettyprint lang-kotlin']/text()")
            if len(kotlin_block) != 0:
                code = ""
                code_list = kotlin_block.extract()
                for item in code_list:
                    code += item
                print(code_list)
                co = Android()
                co.url = url
                co.title = title
                co.lang = "kotlin"
                co.content = code
                co.scrapy_time = datetime.datetime.now()
                co.domain = "android"
                co.save()

            java_block = sel.xpath(
                "//pre[@class='prettyprint lang-java']/text()")
            if len(java_block) != 0:
                code_list = java_block.extract()
                code = ""
                for item in code_list:
                    code += item
                print(code_list)
                row = [url, title, "java", code]
                co = Android()
                co.url = url
                co.title = title
                co.lang = "java"
                co.content = code
                co.scrapy_time = datetime.datetime.now()
                co.domain = "android"
                co.save()

            xml_block = sel.xpath(
                "//pre[@class='prettyprint lang-xml']/text()")
            if len(xml_block) != 0:
                code_list = xml_block.extract()
                code = ""
                for item in code_list:
                    code += item
                print(code_list)
                co = Android()
                co.url = url
                co.title = title
                co.lang = "xml"
                co.content = code
                co.scrapy_time = datetime.datetime.now()
                co.domain = "android"
                co.save()
            pre = sel.xpath("//pre[@translate='no']/text()")
            if len(pre) != 0:
                code_list = pre.extract()
                code = ""
                for item in code_list:
                    code += item
                print(code_list)
                row = [url, title, "pre_code", code]
                co = Android()
                co.url = url
                co.title = title
                co.lang = "unknown language"
                co.content = code
                co.scrapy_time = datetime.datetime.now()
                co.domain = "android"
                co.save()
Example #23
    def parse_product(self, response):
        """
        Parse a single product page.
        """

        metadata = response.meta['userdata']
        sel = Selector(response)

        metadata['url'] = response.url

        model = self.fetch_model(response)
        if model:
            metadata['model'] = model
        else:
            return

        name = self.fetch_name(response)
        if name:
            metadata['name'] = name

        # colors = self.fetch_color(response)
        # if colors:
        #     metadata['color'] = colors

        ret = self.fetch_price(response)
        if 'price' in ret:
            metadata['price'] = ret['price']
        if 'price_discount' in ret:
            metadata['price_discount'] = ret['price_discount']

        description = self.fetch_description(response)
        if description:
            metadata['description'] = description

        detail = self.fetch_details(response)
        if detail:
            metadata['details'] = detail

        # if not metadata.get('model'):
        #     model_node = sel.xpath('//div[@class="l-info-container"]/div[@class="l-info-title"]/h1')
        #     if model_node:
        #         try:
        #             model = model_node.xpath('./text()').extract()[0]
        #             model = self.reformat(model)
        #             if model:
        #                 metadata['model'] = model.upper()
        #                 metadata['name'] = model.lower()
        #         except(TypeError, IndexError):
        #             pass
        #
        # if not metadata.get('model'):
        #     return
        #
        # if not metadata.get('price'):
        #     price_node = sel.xpath('//div[@class="l-info-container"]/div[@class="l-info-title"]/h2')
        #     if price_node:
        #         try:
        #             price = price_node.xpath('./text()').extract()[0]
        #             price = self.reformat(price)
        #             if price:
        #                 metadata['price'] = price
        #         except(TypeError, IndexError):
        #             pass
        #
        # # two parts together form the description
        # # this is the part to the right of the image
        # description1 = None
        # description_node1 = sel.xpath('//div[@class="l-info-description"]/div/div[contains(@class, "description")]')
        # if description_node1:
        #     try:
        #         description1 = description_node1.xpath('./text()').extract()[0]
        #         description1 = self.reformat(description1)
        #     except(TypeError, IndexError):
        #         pass
        # # this is the part below-left of the image
        # description2 = None
        # description_node2 = sel.xpath(
        #     '//div[@class="l-details"]/div[contains(@class, "information")]/div[contains(@class, "description")]/div[@style]')
        # if description_node2:
        #     try:
        #         description2 = description_node2.xpath('./text()').extract()[0]
        #         description2 = self.reformat(description2)
        #     except(TypeError, IndexError):
        #         pass
        # # combine the two parts
        # description = '\r'.join(
        #     filter(None, [description1, description2])
        # )
        # description = self.reformat(description)
        # if description:
        #     metadata['description'] = description
        #
        # detail_nodes = sel.xpath(
        #     '//div[@class="l-details"]/div[contains(@class, "technical")]/*[not(@id="technicaldetails")][not(contains(@class, "button"))]')
        # if detail_nodes:
        #
        #     def func(node):
        #         try:
        #             node_name = node._root.tag
        #             allText = ''.join(self.reformat(val) for val in node.xpath('./text()').extract())
        #             # an h5 tag marks the start of a row
        #             if node_name == 'h5':
        #                 return '\r' + allText
        #             else:
        #                 return allText
        #         except(TypeError, IndexError):
        #             return ''
        #
        #     try:
        #         detail = ''.join(func(node) for node in detail_nodes)
        #         detail = self.reformat(detail)
        #         if detail:
        #             metadata['details'] = detail
        #     except(TypeError, IndexError):
        #         pass

        image_urls = []
        image_nodes = sel.xpath('//div[@id="scroll"]/ul/li[@data-hdimage]')
        for image_node in image_nodes:
            try:
                url = image_node.xpath('./@data-hdimage').extract()[0]
                url = self.reformat(url)
                if url:
                    url = self.process_href(url, response.url)
                    if url:
                        image_urls += [url]
            except (TypeError, IndexError):
                continue
        # if image_nodes:
        #     try:
        #         image_urls = [
        #             self.process_href(val, response.url)
        #             for val in image_nodes.xpath('./@data-hdimage').extract()
        #         ]
        #     except(TypeError, IndexError):
        #         pass

        item = ProductItem()
        item['url'] = metadata['url']
        item['model'] = metadata['model']
        if image_urls:
            item['image_urls'] = image_urls
        item['metadata'] = metadata

        yield item
Example #24
    def parse(self, response):
        #self.driver.get(response.url)
        #sel = Selector(text=self.driver.page_source)
        sel = Selector(response)
        main_title = sel.xpath('//div[@class="section"]/h1[@id="suburb-name"]/text()').extract()
        suburb = sel.xpath('//div[@class="section"]/div[@class="subtitle h3 white"]/text()').extract()
        items = []
        item = RealstateMonthlyItem()
        item['title'] = main_title
        item['suburb_name'] = suburb
        items.append(item)
        house_trends = sel.xpath('//*[@class="slide-content"]')
        for house_trend in house_trends:
            item = RealstateMonthlyItem()
            house_avg_sales_jan_1 = []
            #house_avg_sales_jan_1 = sel.css('#highcharts-8 > svg > g.highcharts-series-group > g:nth-child(2) > path:nth-child(12)/text()').extract()
            #house_avg_sales_jan_1 = house_trend.xpath('//*[@id="highcharts-8"]').extract()
            #working_1: house_avg_sales_jan_1 = house_trend.xpath('//div[@class="slide-section median-price-subsections trend"] and contains(.., "price")').extract()
            #house_avg_sales_jan_1 = sel.xpath('.//div[contains(@class, "slide-section median-price-subsections trend") and contains(name(), '"price"') and contains(name(),'"count"')]').extract()
            house_avg_sales_jan_1 = house_trend.xpath('.//div[contains(@class, "slide-section median-price-subsections trend") and contains(., "price") and contains(., "count")]').extract()
            #house_avg_sales_jan_1_demo = house_trend.xpath('//div[contains(@class, "slide-section median-price-subsections trend")]').extract()

            #item['created_time'] = re.search('[\d\-: ]+', selector.xpath('//div[@class="zwfbtime"]/text()').extract()[0]).group(0)
            #convert list object to string
            house_and_unit_data = str(house_avg_sales_jan_1)
            #clean up html tags and get unit price container
            unit_price_data = re.findall(r'\"unit\":\{\"\d+-\d+-\d+(?=).+true\}\}\,', house_and_unit_data)
            #get house price date and count data
            house_price_data = re.findall(r'\"house\":\{\"\d+-\d+-\d+(?=).+true\}\}\}\}', house_and_unit_data)
            #monthly data for unit
            unit_monthly_data = re.findall(r'\"2015\-\d+\-\d+\"\:\{\"price\"\:\d.+\,\"count\"\:\d+', str(unit_price_data))
            #yearly data for units
            unit_yearly_data = re.findall(r'\"20[0-9]{1}[^5]\-\d+\-\d+\"\:\{\"price\"\:\d+.\d+\,\"count\"\:\d+\}', str(unit_price_data))
            #monthly data for house
            house_monthly_data = re.findall(r'"2015\-\d+\-\d+\"\:\{\"price\"\:\d.+\,\"count\"\:\d+', str(house_price_data))
            #yearly data for house
            house_yearly_data = re.findall(r'\"20[0-9]{1}[^5]\-\d+\-\d+\"\:\{\"price\"\:\d+.\d+\,\"count\"\:\d+\}', str(house_price_data))
            #try:
            #street_number = re.search(r'([0-9\-]+)\s', data).group(1)
            #street_name = data.replace(street_number, '', 1).strip()
            #    return {'street_number': street_number, 'street_name': street_name}
            #except:
            #    return {'street_number': "", 'street_name': data.strip()}
            unit_monthly_date = re.findall(r'2015\-\d+\-\d+',str(unit_monthly_data))
            unit_monthly_price = re.findall(r'[0-9]{6}',str(unit_monthly_data))
            unit_monthly_no_of_sales = re.findall(r'\"count\":\d+',str(unit_monthly_data))
            replaced_unit_monthly_no_of_sales = str(unit_monthly_no_of_sales).replace('"count":', '')
            unit_yearly_date = re.findall(r'\d+\-\d+\-\d+', str(unit_yearly_data))
            unit_yearly_price = re.findall(r'[0-9]{6}', str(unit_yearly_data))
            unit_yearly_no_of_sales = re.findall(r'\"count\":\d+', str(unit_yearly_data))
            replaced_unit_yearly_no_of_sales = str(unit_yearly_no_of_sales).replace('"count":', '')
            house_monthly_date = re.findall(r'2015\-\d+\-\d+',str(house_monthly_data))
            house_monthly_price = re.findall(r'[0-9]{6}',str(house_monthly_data))
            house_monthly_no_of_sales = re.findall(r'\"count\":\d+',str(house_monthly_data))
            replaced_house_monthly_no_of_sales = str(house_monthly_no_of_sales).replace('"count":', '')
            house_yearly_date = re.findall(r'\d+\-\d+\-\d+', str(house_yearly_data))
            house_yearly_price = re.findall(r'[0-9]{6}', str(house_yearly_data))
            house_yearly_no_of_sales = re.findall(r'\"count\":\d+', str(house_yearly_data))
            replaced_house_yearly_no_of_sales = str(house_yearly_no_of_sales).replace('"count":', '')
            
            self.getListOfUnitMonthlyDates(unit_monthly_date, item)
            self.getListOfUnitMonthlyPrice(unit_monthly_price, item)
            self.getListofUnitMonthlyNoOfSales(replaced_unit_monthly_no_of_sales, item)

            #item['unit_mly_date'] = unit_monthly_date
            #item['unit_mly_price'] = unit_monthly_price
            #item['unit_mly_no_of_sales'] = replaced_unit_monthly_no_of_sales
            #item['unit_yly_date'] =  unit_yearly_date
            #item['unit_yly_price'] = unit_yearly_price
            #item['unit_yly_no_of_sales'] = replaced_unit_yearly_no_of_sales
            #item['house_mly_date'] =  house_monthly_date
            #item['house_mly_price'] = house_monthly_price
            #item['house_mly_no_of_sales'] = replaced_house_monthly_no_of_sales
            #item['house_yly_date'] =  house_yearly_date
            #item['house_yly_price'] = house_yearly_price
            #item['house_yly_no_of_sales'] = replaced_house_yearly_no_of_sales
            items.append(item)
        return items
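The regexes above pull the price/count series out of the serialized chart data character by character. A minimal alternative sketch, assuming the chart data can be isolated as a JSON string of the shape {"house": {"2015-01-01": {"price": ..., "count": ...}, ...}, "unit": {...}} (inferred from the patterns being matched; the helper name split_series is illustrative):

import json

def split_series(chart_json, kind):
    """Return sorted (date, price, count) tuples for the 'house' or 'unit' series."""
    data = json.loads(chart_json)
    series = data.get(kind, {})
    return [(date, point.get('price'), point.get('count'))
            for date, point in sorted(series.items())]

# Example with a tiny hand-made blob of the assumed shape:
sample = '{"house": {"2015-01-01": {"price": 650000, "count": 12}}, "unit": {}}'
print(split_series(sample, 'house'))  # [('2015-01-01', 650000, 12)]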
Beispiel #25
0
    def parse(self, response):

        driver = response.meta['driver']

        WebDriverWait(driver, 500).until(
            EC.presence_of_element_located((By.ID, "react-select-2--value-item"))
        )
        firstinput = os.path.join(os.path.abspath(os.curdir), "company.txt")
        with open(firstinput, "r") as f:
            company_name = f.read().splitlines()

        secondinput = os.path.join(os.path.abspath(os.curdir), "street.txt")
        with open(secondinput, "r") as f:
            street_name = f.read().splitlines()

        thirdinput = os.path.join(os.path.abspath(os.curdir), "postcode.txt")
        with open(thirdinput, "r") as f:
            postcode = f.read().splitlines()

        search_input1 = driver.find_element_by_xpath('//*[@id="check-listing"]/div/div[1]/span/div/div/form/div[2]/input')
        search_input1.send_keys(company_name[0])

        search_input2 = driver.find_element_by_xpath('//*[@id="check-listing"]/div/div[1]/span/div/div/form/div[3]/input')
        search_input2.send_keys(street_name[0])

        search_input3 = driver.find_element_by_xpath('//*[@id="check-listing"]/div/div[1]/span/div/div/form/div[4]/input')
        search_input3.send_keys(postcode[0])

        search_button = driver.find_element_by_xpath('//*[@id="check-listing"]/div/div[1]/span/div/div/form/div[5]/div/input')
        search_button.click()

        driver = response.meta['driver']
        # time.sleep(3)
        WebDriverWait(driver, 500).until(
            EC.presence_of_element_located((By.CLASS_NAME, "ubsc_results-top-box-results-wrapper"))
        )

        html = driver.page_source
        response_obj = Selector(text=html)

        details = response_obj.xpath('//div[@class="ubsc_result-listing-row ubsc_not-found"]')

        print("\n"*2)
        print(len(details))
        print("\n" * 2)

        if len(details) == 0:
            print('All listing found')
            scope = ['https://www.googleapis.com/auth/documents.readonly', "https://www.googleapis.com/auth/drive.file",
                     "https://www.googleapis.com/auth/drive"]

            path = os.path.join(os.path.abspath(os.curdir), "client_secret.json")
            creds = ServiceAccountCredentials.from_json_keyfile_name(path, scope)

            DOCUMENT_ID = '1bsIEfAclZ5hZa32HgFknpkJLBB2GjFESYt3vRgDgyE0'

            service = build('docs', 'v1', credentials=creds, cache_discovery=False)

            document = service.documents().get(documentId=DOCUMENT_ID).execute()

            requests = [
                {
                    "insertText":
                        {
                            "text": 'All listing found '+'\n',
                            "location":
                                {
                                    "index": 1
                                }
                        }
                }
            ]
            result = service.documents().batchUpdate(documentId=DOCUMENT_ID, body={'requests': requests}).execute()
        else:
            for detail in details:
                name = detail.xpath('.//div[1]/div[1]/span/text()').get()
                self.sitename.append(name)

            print("\n" * 2)
            print(self.sitename)
            print("\n" * 2)

            scope = ['https://www.googleapis.com/auth/documents.readonly', "https://www.googleapis.com/auth/drive.file",
                     "https://www.googleapis.com/auth/drive"]

            path = os.path.join(os.path.abspath(os.curdir), "client_secret.json")
            creds = ServiceAccountCredentials.from_json_keyfile_name(path, scope)

            DOCUMENT_ID = ''

            service = build('docs', 'v1', credentials=creds, cache_discovery=False)

            document = service.documents().get(documentId=DOCUMENT_ID).execute()

            for name in self.sitename:
                requests = [
                    {
                        "insertText":
                            {
                                "text": 'No listing found on '+name+'\n',
                                "location":
                                    {
                                        "index": 1
                                    }
                            }
                    }
                ]
                result = service.documents().batchUpdate(documentId=DOCUMENT_ID, body={'requests': requests}).execute()
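Both branches above repeat the same credentials/build/batchUpdate sequence. A small hedged sketch of a reusable helper (the function name and keyfile default are illustrative; the scopes are the ones used in the example):

from googleapiclient.discovery import build
from oauth2client.service_account import ServiceAccountCredentials

SCOPES = ['https://www.googleapis.com/auth/documents.readonly',
          "https://www.googleapis.com/auth/drive.file",
          "https://www.googleapis.com/auth/drive"]

def prepend_line(document_id, text, keyfile="client_secret.json"):
    """Insert `text` plus a newline at the top of the Google Doc body (index 1)."""
    creds = ServiceAccountCredentials.from_json_keyfile_name(keyfile, SCOPES)
    service = build('docs', 'v1', credentials=creds, cache_discovery=False)
    requests = [{"insertText": {"text": text + "\n", "location": {"index": 1}}}]
    return service.documents().batchUpdate(
        documentId=document_id, body={'requests': requests}).execute()

# e.g. prepend_line(DOCUMENT_ID, 'No listing found on ' + name)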
Beispiel #26
0
 def parse_page(self, response):
     sel = Selector(response)
     item = XiaobaiheItem()
     item['username'] = sel.xpath('//table/tr/td/a/text()').extract()[2]
     item['text'] = sel.xpath("//textarea/text()").extract()[0]
     return item
Beispiel #27
0
    def parsepage(self, htmldata):
        page = Selector(text=htmldata)
        items = page.xpath('//li[@data-item-type="tweet"]/div')
        resdata = []
        for item in items:
            usernameTweet = item.xpath(
                './/span[@class="username u-dir u-textTruncate"]/b/text()'
            ).extract()[0]
            resdata.append('usernameTweet: {}\n'.format(usernameTweet))
            ID = item.xpath('.//@data-tweet-id').extract()
            if not ID:
                continue
            ID = ID[0]
            resdata.append('ID: {}\n'.format(ID))
            name = item.xpath('.//@data-name').extract()[0]
            resdata.append('name: {}\n'.format(name))
            screen_name = item.xpath('.//@data-screen-name').extract()[0]
            resdata.append('screen_name: {}\n'.format(screen_name))
            avatar = item.xpath(
                './/div[@class="content"]/div[@class="stream-item-header"]/a/img/@src'
            ).extract()[0]
            resdata.append('useravatar: {}\n'.format(avatar))
            # get the tweet text
            text = ' '.join(
                item.xpath('.//div[@class="js-tweet-text-container"]/p//text()'
                           ).extract()).replace(' # ',
                                                '#').replace(' @ ', '@')
            if text == '':
                # no tweet text, so skip this div entirely
                continue
            resdata.append('text: {}\n'.format(text))
            usrurl = item.xpath('.//@data-permalink-path').extract()[0]
            resdata.append('usrurl: https://twitter.com{}\n'.format(usrurl))
            # nbr_retweet = item.css('span.ProfileTweet-action--retweet > span.ProfileTweet-actionCount').xpath(
            #     '@data-tweet-stat-count').extract()
            # if nbr_retweet:
            #     nbr_retweet = int(nbr_retweet[0])
            # else:
            #     nbr_retweet = 0
            # resdata.append('nbr_retweet: {}\n'.format(nbr_retweet))
            # nbr_favorite = item.css('span.ProfileTweet-action--favorite > span.ProfileTweet-actionCount').xpath(
            #     '@data-tweet-stat-count').extract()
            # if nbr_favorite:
            #     nbr_favorite = int(nbr_favorite[0])
            # else:
            #     nbr_favorite = 0
            # resdata.append('nbr_favorite: {}\n'.format(nbr_favorite))
            # nbr_reply = item.css('span.ProfileTweet-action--reply > span.ProfileTweet-actionCount').xpath(
            #     '@data-tweet-stat-count').extract()
            # if nbr_reply:
            #     nbr_reply = int(nbr_reply[0])
            # else:
            #     nbr_reply = 0
            # resdata.append('nbr_reply: {}\n'.format(nbr_reply))
            getdatetime = datetime.datetime.fromtimestamp(
                int(
                    item.xpath(
                        './/div[@class="stream-item-header"]/small[@class="time"]/a/span/@data-time'
                    ).extract()[0])).strftime('%Y-%m-%d %H:%M:%S')
            resdata.append('datetime: {}\n'.format(getdatetime))
            ### get photo
            has_cards = item.xpath('.//@data-card-type').extract()
            if has_cards and has_cards[0] == 'photo':
                has_image = True
                images = item.xpath('.//*/div/@data-image-url').extract()
                resdata.append('imgpath: {}\n'.format(images))
            elif has_cards:
                print('Not handle "data-card-type":\n%s' %
                      item.xpath('.').extract()[0])
            ### get animated_gif
            has_cards = item.xpath('.//@data-card2-type').extract()
            if has_cards:
                if has_cards[0] == 'animated_gif':
                    has_video = True
                    videos = item.xpath('.//*/source/@video-src').extract()
                    resdata.append('videos: {}\n'.format(videos))
                elif has_cards[0] in ('player', 'summary_large_image',
                                      'amplify', 'summary'):
                    has_media = True
                    medias = item.xpath('.//*/div/@data-card-url').extract()
                    resdata.append('medias: {}\n'.format(medias))
                elif has_cards[0] == '__entity_video':
                    pass  # TODO
                    # tweet['has_media'] = True
                    # tweet['medias'] = item.xpath('.//*/div/@data-src').extract()
                else:  # there are many other types of card2 !!!!
                    print('Unhandled "data-card2-type":\n%s' %
                          item.xpath('.').extract()[0])

            is_reply = item.xpath(
                './/div[@class="ReplyingToContextBelowAuthor"]').extract()
            is_reply = is_reply != []
            resdata.append('is_reply: {}\n'.format(is_reply))

            is_retweet = item.xpath(
                './/span[@class="js-retweet-text"]').extract()
            is_retweet = is_retweet != []
            resdata.append('is_retweet: {}\n'.format(is_retweet))

            user_id = item.xpath('.//@data-user-id').extract()[0]
            resdata.append('user_id: {}\n'.format(user_id))
            resdata.append('\n')
        self.writeresdata(resdata)
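Most fields above are read with extract()[0], which raises IndexError as soon as an attribute is missing on a single tweet. A hedged sketch of a small guard (the helper name attr_or is illustrative; extract_first accepting a default is standard parsel/Scrapy selector behaviour):

def attr_or(node, xpath, default=''):
    """Return the first match of `xpath` under `node`, or `default` when nothing matches."""
    return node.xpath(xpath).extract_first(default)

# e.g. name = attr_or(item, './/@data-name'); user_id = attr_or(item, './/@data-user-id')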
Beispiel #28
0
    def parse(self, response):
        global price
        global TMS
        global URL
        global product_id
        global product
        item = JdOnepageItem()
        selector = Selector(response)
        #product_id = selector.xpath('//*[@id="parameter2"]/li[2]/text()').extract()
        name = selector.xpath('//*[@id="parameter2"]/li[1]/@title').extract()
        price = selector.xpath('//*[@id="jd-price"]/text()').extract()
        saler = selector.xpath('//*[@id="extInfo"]/div[2]/em/text()').extract()
        type1 = selector.xpath(
            '//*[@id="root-nav"]/div/div/strong/a/text()').extract()
        #type21 = selector.xpath('//*[@id="root-nav"]/div/div/span[1]/a[1]/text()').extract()
        type22 = selector.xpath(
            '//*[@id="root-nav"]/div/div/span[1]/a[2]/text()').extract()
        #type23 = selector.xpath('//*[@id="root-nav"]/div/div/span[2]/a[1]/text()').extract()
        #type24 = selector.xpath('//*[@id="root-nav"]/div/div/span[2]/a[2]/text()').extract()
        print(product_id)
        #json_url = 'http://p.3.cn/prices/mgets?skuIds=J_' + str(product_id)
        #r = requests.get(json_url).text
        #data = json.loads(r)[0]
        #price = data['m']
        item['num_id'] = TMS
        item['name'] = name
        item['price'] = price
        item['product_id'] = product_id
        item['saler'] = saler
        item['URL'] = URL
        item['Type1'] = type1
        item['Type2'] = type22
        if TMS > 0:
            yield item
        TMS += 1

        with open(
                u"F:\\GitRespo\\ShopSpider\\Auto\\DataWare\\JDTotal\\$$query$$.csv",
                "r") as f:
            total_lines = len(f.readlines())
        if TMS < total_lines:
            line = linecache.getline(
                u"F:\\GitRespo\\ShopSpider\\Auto\\DataWare\\JDTotal\\$$query$$.csv",
                TMS + 1)
            price = 0
            product_id = 0
            URL = 0
            product = line.split(',')
            if product[-1].strip() == '':
                TMS += 1
                line = linecache.getline(
                    u"F:\\GitRespo\\ShopSpider\\Auto\\DataWare\\JDTotal\\$$query$$.csv",
                    TMS + 1)
                product = line.split(',')
            nextLink = "http://item.jd.com/" + product[-1].strip() + ".html"
            #price = product[0]
            URL = nextLink
            product_id = product[-1].strip()
            yield Request(nextLink, self.parse)
        else:
            exit(-1)
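The example above threads price, TMS, URL and product_id through module-level globals across chained requests. A minimal sketch of the same row-by-row chaining using Request.meta instead (spider name, csv_path and the yielded field names are illustrative):

import scrapy

class JdRowSketchSpider(scrapy.Spider):
    name = 'jd_row_sketch'
    csv_path = 'products.csv'  # assumed input: one comma-separated row per product, id in the last column

    def start_requests(self):
        with open(self.csv_path) as f:
            ids = [line.strip().split(',')[-1] for line in f if line.strip()]
        if ids:
            yield scrapy.Request('http://item.jd.com/%s.html' % ids[0],
                                 callback=self.parse, meta={'ids': ids, 'row': 0})

    def parse(self, response):
        ids, row = response.meta['ids'], response.meta['row']
        yield {'product_id': ids[row], 'url': response.url}
        if row + 1 < len(ids):
            yield scrapy.Request('http://item.jd.com/%s.html' % ids[row + 1],
                                 callback=self.parse, meta={'ids': ids, 'row': row + 1})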
Beispiel #29
0
    def parse_new_ok(self, response):
        # build a selector for the page
        page_ok = Selector(response)
        # create the item
        item = SpiderymItem()
        imgs = list()
        urls = list()
        content_text = list()
        yes_or_no = True
        #-------------------------------------------------------- title
        title = response.meta['title']
        if title:
            # if there is a title
            #print('title-------------------',title)
            item['title_0818'] = title
            content_text.append(title)  # add the title; duplicates can be removed later so the title and the body do not repeat
        #-------------------------------------------------------- time (not really used)
        time = response.meta['time']
        if time:
            # if there is a timestamp
            item['time_0818'] = time

        #----------------------------------------------------------- the whole content block, all of the content
        contents = page_ok.xpath('//div[@class="post-content"]')
        content_ss = contents.xpath(
            './descendant-or-self::*')  # note: besides <p> there are other nodes here, e.g. <div>

        num_2 = len(content_ss.extract())
        print('----------- found', num_2, 'nodes in total')

        for content in content_ss:
            ls_text = content.xpath('./text()').extract()  # text content
            #print('number of text nodes---------',len(content.xpath('./text()').extract()))
            #ls_url = content.xpath('./a/@href').extract_first() # link
            #ls_url = content.xpath('./a')
            ls_urls = self.zh_url_mapping(content)  # dict of <a> tags, used later to look up the links and their text
            #print('number of links---------',len(content.xpath('./a/@href').extract()))
            #ls_url_text = content.xpath('./a/text()').extract_first()  # link text
            ls_img = content.xpath('./img/@src').extract_first()  # image

            # tricky points
            # 1. ls_url also has to be a list!!! Otherwise some links under the first <p> are missed, e.g. http://www.0818tuan.com/xbhd/235344.html
            #    the links should replace the link text inside the original text
            # 2. how to keep the original order of text, link, text, link under the first <p>; right now the text comes out as one block and the links as another
            '''    
            if ls_url:
                # ignore short addresses
                ls_url = self.zh_url(ls_url)  # strip the site prefix 0818tuan.com... from the link
                if ls_url.find('http') != -1:   # found "http"
                    # without http it is a short link such as /xbhd/235097.html, so ignore it; otherwise add the link and its text
                    # another case: /tao/taoke.php?item_id=534465655558, where JS hides the real link; these are the site's own links and can be dropped
                    urls.append(ls_url)  # add to the links
                    content_text.append(ls_url)  # add to the text content
            '''
            if ls_text:
                # if there is text (a list)
                #ls_text = self.zh_text(ls_text)  # stripping whitespace here would scramble the order
                if len(ls_text) > 1:
                    print('found several text nodes ------', len(ls_text), 'of them')
                content_ls = ';'.join(ls_text)
                #print('merged text------------------',content_ls)
                if ls_urls:
                    for ls_url in ls_urls:
                        # ls_url is a key of the dict
                        content_ls = content_ls.replace(
                            ls_url, '----' + ls_urls[ls_url])  # replace the link text inside the text

                content_text.append(content_ls)  # also kept as a list

            if ls_urls:
                urls.append(ls_urls[ls_url])

            if ls_img:
                # if there is an image
                #print('img------------',ls_img)
                imgs.append(ls_img)

        num_urls = len(urls)
        num_imgs = len(imgs)
        num_cts = len(content_text)

        if num_imgs == 0 and num_urls == 0:
            # if there are neither images nor links (text only), do not collect
            yes_or_no = False
        if self.is_keyword_valid(title, True):
            # but if the title contains a keyword, collect it anyway
            yes_or_no = True

        print('yes_or_no-------------', yes_or_no)
        print('number of original links--------------', num_urls)
        print('number of images--------------', num_imgs)
        print('number of original text pieces--------------', num_cts)
        print('original text content--------------', content_text)

        cts = self.zh_cts(content_text, False)  # strip the useless text first, then deduplicate, otherwise the order gets scrambled
        #cts = list(set(cts))  # deduplicating with set() scrambles the order
        cts = self.zh_cf(cts)  # dedupe this way instead, which keeps the order
        urls = list(set(urls))  # remove duplicate links

        #---------------------------- all images/text/links collected, store them on the item
        item['cts_0818'] = cts
        item['urls_0818'] = urls
        item['imgs_0818'] = imgs
        item['yon'] = yes_or_no

        yield item
        '''
        #print(contents)
        #----------------------------------------------------------- all div nodes (only some posts have them), used for links
        # these links all seem to contain 0818tuan, i.e. the site's own links; consider dropping them
        div = contents.xpath('./div')
        if div:
            # if there is a div
            print('has div------------', div.extract())
            # temporary url
            url_ls = div.xpath('./a/@href')
            if url_ls:
                # if there is a link; usually a div implies a link
                urls.append('link inside div--' + url_ls.extract_first())
        #------------------------------------------------------------ text of the whole content block, used for the content
        # all of the text
        all_ct = contents.xpath('string(.)').extract()
        print(type(all_ct))
        print('all text 1---------', all_ct)
        print(len(all_ct), 'items----------------')
        print(len(all_ct[:-4]), 'items----------------')
        for i in all_ct[:-5]:
            all_ct2 = ''.join(i.split())
            print('all text 2---------', all_ct2)

        text = contents.xpath('./text()')
        for i in text.extract():
            # remove text like '\r\n                '
            #b_ls = i.replace('\n','').replace('\r','').replace(' ','')
            b_ls = "".join(i.split())
            if b_ls != '':
                # if there is content
                print('text in the main block ------------', b_ls)
                content_text += b_ls

        #------------------------------------------------------- scan all paragraphs for links and images
        p = contents.xpath('./p')
        # temporary p_ls: expand everything and drop the last 5 <p> paragraphs
        p_ls = p.extract()[:-5]
        for i in range(len(p_ls)):  # --------------------------- iterate over all paragraphs
            # once inside a <p>, how to continue the xpath from there?
            str_1 = './p[{}]/text()'.format(i+1)  # ---------------- find the text of this paragraph
            text_1 = contents.xpath(str_1).extract_first()  # the text goes to the content
            if text_1:
                # if there is content, do not record it yet; still deciding which text to include
                c_ls = "".join(text_1.split())
                content_text += c_ls
                print('text inside <p>----------', c_ls)
            #-------------------------------------------------------- check whether the paragraph has links
            str_2 = './p[{}]/a/@href'.format(i+1)  # get the href of <a> nodes under the paragraph; usually one of the two kinds of link below
            #http://m.0818tuan.com/jd/?u=https%3A%2F%2Fitem.jd.com%2F51501863225.html
            #http://m.0818tuan.com/suning/?visitUrl=https%3A%2F%2Fproduct.suning.com%2F0000000000%2F10310212467.html
            text_2 = contents.xpath(str_2).extract()  # goes to the links and also to the content; there may be several links
            if text_2:
                # if links were found, add them to the url list
                for x in text_2:
                    url_ls = self.url_zh(x)
                    print('link----------', url_ls)
                    urls.append('link in <p> paragraph--' + url_ls)
                # also add to the content
                    content_text = content_text + url_ls + '\n'
            #------------------------------------------------------- check whether the paragraph has image content
            str_3 = './p[{}]/img/@src'.format(i+1)  # get the image link
            text_3 = contents.xpath(str_3).extract_first()
            if text_3:
                # if there is an image
                imgs.append(text_3)
                print('image address-----------', text_3)


        #---------------------------- all images/text/links collected, store them on the item
        item['content_0818'] = content_text
        item['url_0818'] = urls
        item['imgs_0818'] = imgs
        
        if len(p) > 6:
            # drop the last 5 <p> paragraphs
            print('warning----------- more than 6 paragraphs')
            for i in range(len(p[:-5])):
                cc_1 = './p[{}]/text()'.format(i)
                print('cc_1-------------',cc_1)
                cc_2 = contents.xpath(cc_1).extract_first()
                print('cc_2-------------',cc_2)
                # if there is content
                if cc_2:
                    cc_a = re.sub(r'(<br/?>\s*\n?)+', '@#@',cc_2)
                    cc_b = cc_a.replace('@#@','\n')
                    print('content in the extra paragraphs---------------', cc_b)
                    # ignore images for now
                    item['content_0818'] = cc_b
            yield item

            src = contents.xpath('./p[1]/img/@src').extract()
            if len(src) > 0:
                # look for images: if none, skip; otherwise store them in the list, since there may be several
                for i in range(len(src)):
                    imgs.append(src[i])
                print(imgs)
                item['imgs_0818'] = imgs
            else:
                print('no images')
        
        '''
Beispiel #30
0
 def parse(self, response):
     item = Bills()
     sel = Selector(response)
     try:
         item['election_year'] = self.election_year[sel.xpath(
             '//span[@id="lab_MDSL"]/text()').re(u'第(\d+)屆')[0]]
     except (IndexError, KeyError):
         return
     item['county'] = u'新北市'
     item['id'] = re.findall(u'=(\d+)$', response.url)[0]
     item['type'] = sel.xpath('//span[@id="lab_BillType"]/text()').extract(
     )[0].strip() if sel.xpath(
         '//span[@id="lab_BillType"]/text()').extract() else ''
     item['category'] = sel.xpath(
         '//span[@id="lab_BillClass"]/text()').extract()[0].strip(
         ) if sel.xpath(
             '//span[@id="lab_BillClass"]/text()').extract() else ''
     item['proposed_by'] = sel.xpath(
         '//span[@id="lab_Provider"]/text()').extract()[0].strip().split(
             u',') if sel.xpath(
                 '//span[@id="lab_Provider"]/text()').extract() else []
     item['petitioned_by'] = sel.xpath(
         '//span[@id="lab_SupportMan"]/text()').extract()[0].strip().split(
             u',') if sel.xpath(
                 '//span[@id="lab_SupportMan"]/text()').extract() else []
     item['abstract'] = '\n'.join([
         re.sub('\s', '', x) for x in sel.xpath(
             '//span[@id="lab_Reason"]/div//text()').extract()
     ])
     item['description'] = '\n'.join([
         re.sub('\s', '', x) for x in sel.xpath(
             '//span[@id="lab_Description"]/div//text()').extract()
     ])
     item['methods'] = '\n'.join([
         re.sub('\s', '', x) for x in sel.xpath(
             '//span[@id="lab_Method"]/div/text()').extract()
     ])
     motions = []
     motions.append(
         dict(
             zip(['motion', 'resolution', 'date'], [
                 u'一讀決議', '\n'.join([
                     re.sub('\s', '', x) for x in sel.xpath(
                         '//span[@id="lab_OneResult"]//text()').extract()
                 ]), None
             ])))
     motions.append(
         dict(
             zip(['motion', 'resolution', 'date'], [
                 u'審查意見', '\n'.join([
                     re.sub('\s', '', x) for x in sel.xpath(
                         '//span[@id="lab_ExamResult"]//text()').extract()
                 ]), None
             ])))
     motions.append(
         dict(
             zip(['motion', 'resolution', 'date'], [
                 u'大會決議', '\n'.join([
                     re.sub('\s', '', x) for x in sel.xpath(
                         '//span[@id="lab_Result"]//text()').extract()
                 ]), None
             ])))
     motions.append(
         dict(
             zip(['motion', 'resolution', 'date'], [
                 u'二讀決議', '\n'.join([
                     re.sub('\s', '', x) for x in sel.xpath(
                         '//span[@id="lab_TwoResult"]//text()').extract()
                 ]), None
             ])))
     motions.append(
         dict(
             zip(['motion', 'resolution', 'date'], [
                 u'三讀決議', '\n'.join([
                     re.sub('\s', '', x) for x in sel.xpath(
                         '//span[@id="lab_ThreeResult"]//text()').extract()
                 ]), None
             ])))
     item['motions'] = motions
     item['links'] = response.url
     return item
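The five motions blocks above differ only in their label and result-span id. A hedged sketch of the same construction written as a loop over (label, span id) pairs taken from the example (the helper name build_motions is illustrative):

import re

MOTION_SPANS = [
    (u'一讀決議', 'lab_OneResult'),
    (u'審查意見', 'lab_ExamResult'),
    (u'大會決議', 'lab_Result'),
    (u'二讀決議', 'lab_TwoResult'),
    (u'三讀決議', 'lab_ThreeResult'),
]

def build_motions(sel):
    """Build the motions list from the lab_* result spans, mirroring the dict(zip(...)) blocks above."""
    motions = []
    for label, span_id in MOTION_SPANS:
        resolution = '\n'.join(
            re.sub(r'\s', '', x)
            for x in sel.xpath('//span[@id="%s"]//text()' % span_id).extract())
        motions.append({'motion': label, 'resolution': resolution, 'date': None})
    return motions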