Example #1
    def get_news(self, response):
        l = ItemLoader(item=SpiderItem(), response=response)
        l.add_value(
            'title',
            response.xpath('//div[@id="contentwrap"]/h1/text()').extract())

        l.add_value('date',
                    response.xpath('//div[@class="infos"]/p/text()').extract())

        r1 = r"\d{4}\-\d{1,2}\-\d{1,2}\s\d{2}\:\d{2}\:\d{2}"
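        # get_collected_values('date') returns the raw values gathered so far
        # (before output processors run), so the timestamp can be pulled out with a regex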
        date0 = re.compile(r1)
        date = ''.join(l.get_collected_values('date'))
        date1 = date0.findall(date)
        l.replace_value('date', date1[0])
        l.add_value('content',
                    response.xpath('//div[@class="content"]/text()').extract())
        l.add_value(
            'content',
            response.xpath('//div[@class="description"]/text()').extract())
        l.add_value(
            'content',
            response.xpath('//div[@class="content"]/p/text()').extract())
        l.add_value(
            'content',
            response.xpath('//div[@class="content"]/div/p/text()').extract())

        l.add_value('url', response.url)
        l.add_value('collection_name', self.name)
        return l.load_item()
Example #2
    def get_news(self, response):
        try:
            l = ItemLoader(item=SpiderItem(), response=response)
            l.add_value('title', response.xpath('//span[@id="thread_subject"]/text()').extract())

            l.add_value('date', response.xpath('//div[@class="authi"]/em/text()').extract())

            r1 = r"\d{4}\-\d{1,2}\-\d{1,2}\s\d{2}\:\d{2}"
            date0 = re.compile(r1)
            date = ''.join(l.get_collected_values('date'))
            date1 = date0.findall(date)
            l.replace_value('date', date1[0])
            l.add_value('content', response.xpath('//div[@class="t_fsz"]/table/tr/td/div/text()').extract())
            l.add_value('content', response.xpath('//div[@class="t_fsz"]/table/tr/td/br/text()').extract())
            l.add_value('content', response.xpath('//div[@class="t_fsz"]/table/tr/td/text()').extract())
            l.add_value('content', response.xpath('//div[@class="t_fsz"]/table/tr/td/div/font/font/text()').extract())
            l.add_value('content', response.xpath('//div[@class="t_fsz"]/table/tr/td/font/font/text()').extract())
            l.add_value('content', response.xpath('//div[@class="t_fsz"]/table/tr/td/font/font/font/text()').extract())
            l.add_value('content', response.xpath('//div[@class="t_fsz"]/table/tr/td/p/font/font/text()').extract())
            l.add_value('content', response.xpath('//div[@class="t_fsz"]/table/tr/td/div/div/div/font/font/strong/text()').extract())

            l.add_value('url', response.url)
            l.add_value('collection_name', self.name)
            return l.load_item()
        except Exception as e:
            self.logger.error('error url: %s error msg: %s' % (response.url, e))
            l = ItemLoader(item=SpiderItem(), response=response)
            l.add_value('title', '')
            l.add_value('date', '1970-01-01 00:00:00')
            l.add_value('source', '')
            l.add_value('content', '')
            l.add_value('url', response.url)
            l.add_value('collection_name', self.name)
            l.add_value('website', self.website)
            return l.load_item()
Example #3
    def get_news(self, response):
        l = ItemLoader(item=SpiderItem(), response=response)
        l.add_value('title',
                    response.xpath('//table/tr[3]/td[2]/text()').extract())

        l.add_value('date',
                    response.xpath('//table/tr[4]/td/text()').extract())

        r1 = r"\d{4}\-\d{1,2}\-\d{1,2}"
        date0 = re.compile(r1)
        date = ''.join(l.get_collected_values('date'))
        date1 = date0.findall(date)
        l.replace_value('date', date1[0])
        l.add_value(
            'content',
            response.xpath('//td[@class="tdbg"]/div/font/text()').extract())
        l.add_value(
            'content',
            response.xpath('//td[@class="tdbg"]/p/font/text()').extract())
        l.add_value(
            'content',
            response.xpath('//td[@class="tdbg"]/p/span/text()').extract())
        l.add_value('content',
                    response.xpath('//td[@class="tdbg"]/p/text()').extract())

        l.add_value('url', response.url)
        l.add_value('collection_name', self.name)
        l.add_value('website', self.website)
        return l.load_item()
Example #4
    def get_news(self, response):
        try:
            l = ItemLoader(item=SpiderItem(), response=response)
            l.add_value('title', response.xpath('//div[@id="biaoti"]/text()').extract())
            l.add_value('title', response.xpath('//h1[@id="biaoti"]/text()').extract())

            l.add_value('date', response.xpath('//span[@id="pubtime_baidu"]/text()').extract())
            l.add_value('date', response.xpath('//div[@class="center lh32 grey12a"]/text()').extract())
            l.add_value('date', response.xpath('//div[@id="left"]/h2/text()').extract())

            l.add_value('content', response.xpath('//div[@id="zw"]/p/text()').extract())
            l.add_value('content', response.xpath('//div[@id="zw"]/strong/p/text()').extract())

            l.add_value('url', response.url)
            l.add_value('collection_name', self.name)
            url = response.url
            if url[11:17] == "shzfzz":
                date = ''.join(l.get_collected_values('date'))
                date = time.strptime(date.split()[0], u'%Y年%m月%d日')
                l.replace_value('date', time.strftime('%Y-%m-%d %H:%M:%S', date))
            return l.load_item()
        except Exception as e:
            self.logger.error('error url: %s error msg: %s' % (response.url, e))
            l = ItemLoader(item=SpiderItem(), response=response)
            l.add_value('title', '')
            l.add_value('date', '1970-01-01 00:00:00')
            l.add_value('source', '')
            l.add_value('content', '')
            l.add_value('url', response.url)
            l.add_value('collection_name', self.name)
            l.add_value('website', self.website)
            return l.load_item()
Example #5
    def parse_detail(self, response):
        t_name = response.xpath("//h1/a/text()").extract_first()
        for post in response.xpath("//li[@class='li_capsul_entry']"):
            l = ItemLoader(item=UludagtutorialItem(), selector=post)
            l.add_value("title", response.meta.get('title_name', t_name))
            l.add_xpath(
                "comment",
                ".//div[@class='entry-p']/text() | .//div[@class='entry-p']/a/text()"
            )
            l.add_xpath(
                "user",
                ".//div[@class='entry-secenekleri']/a[@class='alt-u yazar']/text()"
            )
            l.add_xpath("date", ".//span[@class='date-u']/a/text()")
            l.add_xpath(
                "url",
                "substring-after(.//div[@class='voting_nw']/a/@href, '//')")

            yield scrapy.FormRequest(
                "https://www.uludagsozluk.com/ax/?a=yenit&ne=ben&nw=pop",
                formdata={"benu": l.get_collected_values('user')[0]},
                method='POST',
                callback=self.parse_post_detail,
                dont_filter=True,
                meta={'l': l})

        next_page_url = response.xpath(
            "//a[@class='nextpage']/@href").extract_first()

        if next_page_url is not None:
            yield scrapy.Request("https://www.uludagsozluk.com" +
                                 next_page_url,
                                 callback=self.parse_detail,
                                 dont_filter=True)
Example #6
    def get_news(self, response):
        loader = ItemLoader(item=SpiderItem(), response=response)
        try:
            loader.add_value("title", response.xpath('//h1[@id="title"]/text()').extract_first())
            loader.add_value("title", response.xpath('//span[@id="title"]/text()').extract_first())

            loader.add_value("date", response.xpath('//span[@class="time"]/text()').extract_first())
            loader.add_value("date", response.xpath('//span[@id="pubtime"]/text()').extract_first())
            date = ''.join(loader.get_collected_values("date")).strip()
            date = time.strptime(date, '%Y年%m月%d日 %H:%M:%S')
            loader.replace_value("date", time.strftime("%Y-%m-%d %H:%M:%S", date))

            loader.add_value("content",
                             ''.join(response.xpath('//div[@id="content"]/descendant-or-self::text()').extract()))
            loader.add_value("content",
                             ''.join(response.xpath('//div[@class="article"]/descendant-or-self::text()').extract()))
        except Exception as e:
            self.logger.error('error url: %s error msg: %s' % (response.url, e))
            loader.add_value('title', '')
            loader.add_value('date', '1970-01-01 00:00:00')
            loader.add_value('content', '')

        loader.add_value('url', response.url)
        loader.add_value('collection_name', self.name)
        loader.add_value('website', self.website)

        return loader.load_item()
Example #7
    def get_news(self, response):
        try:
            l = ItemLoader(item=SpiderItem(), response=response)
            l.add_value('title', response.xpath('//div[@class="l_tit"]/text()').extract())

            l.add_value('date', response.xpath('//span[@id="pubtime_baidu"]/text()').extract())

            date = ''.join(l.get_collected_values('date'))
            # date = time.strptime(date.split()[0], '%Y-%m-%d')
            # l.replace_value('date', time.strftime('%Y-%m-%d %H:%M:%S', date))

            l.add_value('content', response.xpath('//div[@class="TRS_Editor"]/p/text()').extract())
            l.add_value('content', response.xpath('//div[@class="TRS_Editor"]/p/strong/text()').extract())
            l.add_value('content', response.xpath('//div[@class="article"]/div/p/text()').extract())

            l.add_value('url', response.url)
            l.add_value('collection_name', self.name)
            return l.load_item()
        except Exception as e:
            self.logger.error('error url: %s error msg: %s' % (response.url, e))
            l = ItemLoader(item=SpiderItem(), response=response)
            l.add_value('title', '')
            l.add_value('date', '1970-01-01 00:00:00')
            l.add_value('source', '')
            l.add_value('content', '')
            l.add_value('url', response.url)
            l.add_value('collection_name', self.name)
            l.add_value('website', self.website)
            return l.load_item()
Example #8
    def parse_item(self, response):

        l = ItemLoader(item=PropertiesItem(), response=response)

        l.add_xpath('author', '//*[@id="main-content"]/div[1]/span[2]/text()')
        l.add_xpath('title', '//*[@id="main-content"]/div[3]/span[2]/text()')
        l.add_xpath('datetime',
                    '//*[@id="main-content"]/div[4]/span[2]/text()')
        l.add_xpath('board', '//*[@id="main-content"]/div[2]/span[2]/text()')

        l.add_xpath('category',
                    '//*[@id="main-content"]/div[3]/span[2]/text()',
                    re='^\[.+\]')
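        # if no bracketed category tag was collected, fall back to matching a 'Re' reply prefix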
        if len(l.get_collected_values('category')) == 0:
            l.add_xpath('category',
                        '//*[@id="main-content"]/div[3]/span[2]/text()',
                        re='^Re')

        # Housekeeping fields
        l.add_value('url', response.url)
        l.add_value('project', self.settings.get('BOT_NAME'))
        l.add_value('spider', self.name)
        l.add_value('server', socket.gethostname())
        l.add_value('rtrv_date', datetime.datetime.now())

        return l.load_item()
Example #9
    def parse_article(self, response):
        loader = ItemLoader(item=XfjyArticleItem(), response=response)

        article_url = response.url
        title = response.meta["title"]
        # date = response.meta["date"]
        tags_list = response.meta["tags_list"]
        block_type = ",".join(tags_list)

        # handle the attachments here for now
        attchments = response.xpath(
            "//div[@class='main_nei_you_baio_content']//span//a")
        names_urls = [(attchment.xpath(".//span//text()").extract_first(),
                       attchment.xpath(".//@href").extract_first())
                      for attchment in attchments]
        name_url = {name: response.urljoin(url) for name, url in names_urls}
        attchments = json.dumps(name_url, ensure_ascii=False)

        index = response.meta.get("index")

        loader.add_value("article_url", article_url)
        loader.add_value("title", title)
        loader.add_value("tags_list", tags_list)
        loader.add_value("block_type", block_type)
        loader.add_value("attch_name_url", attchments)
        loader.add_xpath(
            "author",
            "//div[@class='main_nei_you_baio_content']//span[@class='authorstyle44003']//text()"
        )
        loader.add_value(
            "content",
            response.xpath(
                "//div[@class='main_nei_you_baio_content']//td[@class='contentstyle44003']"
            ))
        loader.add_xpath(
            "img",
            "//div[@class='main_nei_you_baio_content']//td[@class='contentstyle44003']//@src"
        )
        loader.add_xpath(
            "detail_time",
            "//div[@class='main_nei_you_baio_content']//span[@class='timestyle44003']//text()"
        )
        loader.add_value("index", index)

        imgs = loader.get_collected_values("img")
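        # request every absolute image URL collected from the article body; relative src values are skipped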
        if imgs:
            for img in imgs:
                if "http" in img:
                    yield Request(img,
                                  callback=self.parse_img,
                                  dont_filter=True,
                                  meta={
                                      "type": "image",
                                      "article_url": response.url
                                  })

        yield loader.load_item()
Example #10
    def get_news(self, response):
        try:
            loader = ItemLoader(item=SpiderItem(), response=response)

            loader.add_value(
                'title',
                response.xpath(
                    '//div[@class="left"]/h1/text()').extract_first())
            loader.add_value(
                'title',
                response.xpath('//h1[@class="h1"]/text()').extract_first())

            loader.add_value(
                'date',
                response.xpath('//div[@class="zuoze"]/text()').extract_first())
            loader.add_value(
                'date',
                response.xpath(
                    '//span[@class="post-time"]/text()').extract_first())
            date = ''.join(loader.get_collected_values('date'))
            if date == '':
                return
            loader.replace_value('date', date.strip() + ":00")

            loader.add_value(
                'content', ''.join(
                    response.xpath(
                        '//span[@id="zoom"]/descendant-or-self::text()').
                    extract()))
            loader.add_value(
                'content', ''.join(
                    response.xpath(
                        '//p[@class="summary"]/descendant-or-self::text()').
                    extract()))

            loader.add_value('url', response.url)
            loader.add_value('collection_name', self.name)
            loader.add_value('website', self.website)

            yield loader.load_item()
        except Exception as e:
            self.logger.error('error url: %s error msg: %s' %
                              (response.url, e))
            l = ItemLoader(item=SpiderItem(), response=response)
            l.add_value('title', '')
            l.add_value('date', '1970-01-01 00:00:00')
            l.add_value('source', '')
            l.add_value('content', '')
            l.add_value('url', response.url)
            l.add_value('collection_name', self.name)
            l.add_value('website', self.website)
            yield l.load_item()
Example #11
    def parse(self, response):
        load_data = ItemLoader(item=GoogleSearchBlockData(), response=response)
        load_data.add_xpath('name', '//div[@class="BNeawe vvjwJb AP7Wnd"]/text()')
        load_data.add_xpath('url', '//div[@class="kCrYT"]//a/@href')
        load_data.add_xpath('next_site', '//a[@class="nBDE1b G5eFlf"]/@href')
        next_page = load_data.get_collected_values('next_site')
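        # the last collected next_site link is assumed to point at the following results page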

        try:
            next_page = next_page[-1]
            yield load_data.load_item()
            yield scrapy.Request(next_page, callback=self.parse)
        except IndexError:
            self.log('\n\n My LOGGER: \n' + 'Finished downloading' + '\n\n')
Example #12
    def parse_article_ajax(self, response):
        try:
            article = json.loads(response.body)["data"]["article"]
        except BaseException as e:
            self.log(response.body.decode(), level=logging.DEBUG)
            self.log("解析json过程出现错误,没有article,链接:{},错误:{}".format(
                response.url, str(e)),
                     level=logging.ERROR)
            self.log("{}".format(str(response.request.body)),
                     level=logging.ERROR)
        else:
            loader = ItemLoader(item=YibanArticleItem(), response=response)
            article_url = response.meta.get("article_url")
            title = response.meta.get("title")
            tags_list = [
                "易班",
            ]
            tags_list.append(article.get("Sections_name"))
            block_type = ",".join(tags_list)
            content = article.get("content")
            detail_time = article.get("createTime")

            # the Yiban site has no attachments
            attchments = ""

            index = response.meta.get("index")

            loader.add_value("article_url", article_url)
            loader.add_value("title", title)
            loader.add_value("tags_list", tags_list)
            loader.add_value("block_type", block_type)
            loader.add_value("content", content)

            content_response = Selector(text=content)
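            # the article body arrives as an HTML string inside the JSON, so re-parse it to collect <img> src URLs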
            loader.add_value("img",
                             content_response.xpath("//img//@src").extract())
            loader.add_value("detail_time", detail_time)
            loader.add_value("index", index)

            imgs = loader.get_collected_values("img")
            if imgs:
                for img in imgs:
                    if "http" in img:
                        yield Request(img,
                                      callback=self.parse_img,
                                      dont_filter=True,
                                      meta={
                                          "type": "image",
                                          "article_url": article_url
                                      })
            yield loader.load_item()
Example #13
    def parse(self, response):
        base_post = response.css('article.post_preview')

        for post in base_post:
            item = ItemLoader(authorItem(), response)
            for key, value in self.author_css_selectors.items():
                item.add_value(key, post.css(value).extract())

            yield item.load_item()
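            # follow the author's posts page using the first collected author_url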
            yield response.follow(item.get_collected_values('author_url')[0] +
                                  'posts/',
                                  callback=self.parse_author)

        yield response.follow(response.css(
            self.line_post_css_selectors['post_url']).extract()[0],
                              callback=self.parse_post)
Example #14
    def parse(self, response):

        l = ItemLoader(
            item=LinksItem(),
            response=response,
        )
        l.add_value('url', response.meta['source_url'])
        links = set([])
        for n in [1, 2]:
            selector = 'div.entrytext p:nth-child(%s) a ::attr(href)' % n
            for link in response.css(selector).extract():
                if '/tag/' not in link:
                    links.add(link)
        l.add_value('links', list(links))
        l.add_value('count', len(l.get_collected_values('links')))

        yield l.load_item()
Example #15
    def parse_detail(self, response, char):
        loader = ItemLoader(item=char, response=response)

        loader.add_value("url", response.url)
        loader.add_css("image", selectors["CHARACTER_IMAGE"])
        loader.add_css("name", selectors["CHARACTER_NAME"])
        loader.add_css("feature_films", selectors["CHARACTER_FEATURE_FILMS"])
        loader.add_css("short_films", selectors["CHARACTER_SHORT_FILMS"])
        loader.add_css("shows", selectors["CHARACTER_SHOWS"])
        loader.add_css("games", selectors["CHARACTER_GAMES"])
        loader.add_css("rides", selectors["CHARACTER_RIDES"])
        loader.add_css("animator", selectors["CHARACTER_ANIMATOR"])
        loader.add_css("designer", selectors["CHARACTER_DESIGNER"])
        loader.add_css("voice", selectors["CHARACTER_VOICE"])
        loader.add_css("portrayed_by", selectors["CHARACTER_PORTRAYED_BY"])
        loader.add_css("performance_model",
                       selectors["CHARACTER_PERFORMANCE_MODEL"])
        loader.add_css("inspiration", selectors["CHARACTER_INSPIRATION"])
        loader.add_css("awards", selectors["CHARACTER_AWARDS"])
        loader.add_css("fullname", selectors["CHARACTER_FULLNAME"])
        loader.add_css("other_names", selectors["CHARACTER_OTHER_NAMES"])
        loader.add_css("occupation", selectors["CHARACTER_OCCUPATION"])
        loader.add_css("affiliations", selectors["CHARACTER_AFFILIATIONS"])
        loader.add_css("home", selectors["CHARACTER_HOME"])
        loader.add_css("likes", selectors["CHARACTER_LIKES"])
        loader.add_css("dislikes", selectors["CHARACTER_DISLIKES"])
        loader.add_css("powers", selectors["CHARACTER_POWERS"])
        loader.add_css("paraphernalia", selectors["CHARACTER_PARAPHERNALIA"])
        loader.add_css("status", selectors["CHARACTER_STATUS"])
        loader.add_css("parents", selectors["CHARACTER_PARENTS"])
        loader.add_css("siblings", selectors["CHARACTER_SIBLINGS"])
        loader.add_css("family", selectors["CHARACTER_FAMILY"])
        loader.add_css("partner", selectors["CHARACTER_PARTNER"])
        loader.add_css("children", selectors["CHARACTER_CHILDREN"])
        loader.add_css("pets", selectors["CHARACTER_PETS"])

        if len(loader.get_css(selectors["CHARACTER_NAME"])) < 1:
            loader.add_css("name", selectors["PAGE_HEADER_TITLE"])

        if len(loader.get_css(selectors["CHARACTER_IMAGE"])) < 1:
            loader.add_css("image", selectors["CHARACTER_THUMB_IMAGE"])

        logging.info("Crawl %s" % loader.get_collected_values("name"))

        char = loader.load_item()
        yield char
Example #16
    def parse_lot(self, response):
        l = ItemLoader(
            item=HbarrysmithKaufmanauctionswvAuctionsfirstResultItem(),
            response=response)
        l.default_output_processor = TakeFirst()

        l.add_xpath('LotNum', '//span[@class="lot-num"]/text()')
        l.add_xpath('Lead', '//span[@class="lot-name"]/text()')
        l.add_xpath(
            'Description',
            'string(//div[contains(@class, "description-info-content")])')
        l.add_xpath(
            'Price',
            '//span[@id and contains(text(), "Lot closed - High bid:")]/span/text()'
        )
        l.add_value('Sale', l.get_collected_values('Price'))

        yield l.load_item()
Example #17
 def get_news(self, response):
     l = ItemLoader(item=SpiderItem(), response=response)
     l.add_value(
         'title',
         response.xpath('//div[@class="article_title"]/text()').extract())
     l.add_value(
         'date',
         response.xpath('//div[@class="article_title1"]/text()').extract())
     r1 = r"\d{1,4}"
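     # the pattern captures year, month and day as separate numeric groups, re-joined with dashes below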
     date0 = re.compile(r1)
     date = ''.join(l.get_collected_values('date'))
     date1 = date0.findall(date)
     date1 = date1[0] + '-' + date1[1] + '-' + date1[2]
     l.replace_value('date', date1)
     l.add_value(
         'content',
         response.xpath('//div[@id="MyContent"]/p/span/text()').extract())
     l.add_value(
         'content',
         response.xpath(
             '//div[@id="MyContent"]/p/font/span/text()').extract())
     l.add_value(
         'content',
         response.xpath('//p[@class="MsoNormal"]/span/span/font/span/text()'
                        ).extract())
     l.add_value(
         'content',
         response.xpath(
             '//p[@class="MsoNormal"]/span/span/font/text()').extract())
     l.add_value(
         'content',
         response.xpath('//div[@class="article_intro"]/text()').extract())
     l.add_value(
         'content',
         response.xpath('//div[@id="MyContent"]/p/font/text()').extract())
     l.add_value(
         'content',
         response.xpath('//p[@id="MsoNormal"]/span/text()').extract())
     l.add_value('url', response.url)
     l.add_value('collection_name', self.name)
     l.add_value('website', self.website)
     return l.load_item()
Example #18
 def get_news(self, response):
     l = ItemLoader(item=SpiderItem(), response=response)
     l.add_value(
         'title',
         response.xpath(
             '//div[@id="lbyright_xwxq_title"]/text()').extract())
     l.add_value(
         'date',
         response.xpath('//div[@id="lbyright_xwxq_xxx"]/text()').extract())
     r1 = r"\d{4}\-\d{1,2}\-\d{1,2}"
     date0 = re.compile(r1)
     date = ''.join(l.get_collected_values('date'))
     date1 = date0.findall(date)
     l.replace_value('date', date1[0])
     l.add_value(
         'content',
         response.xpath(
             '//div[@id="lbyright_xwxq_txt"]/p/span/text()').extract())
     l.add_value('url', response.url)
     l.add_value('collection_name', self.name)
     l.add_value('website', self.website)
     return l.load_item()
Example #19
    def get_news(self,response):
        l = ItemLoader(item=SpiderItem(),response=response)
        l.add_value('title', response.xpath('//h2[@class="titleH2"]/text()').extract())
        l.add_value('title', response.xpath('//div[@class="Article-Left"]/h3/text()').extract())
        l.add_value('title', response.xpath('//div[@class="tit"]/h1/text()').extract())

        l.add_value('date',response.xpath('//div[@class="from"]/span/text()').extract())
        l.add_value('date',response.xpath('//div[@class="CopyFrom"]/text()').extract())
        l.add_value('date',response.xpath('//div[@class="auther-from"]/text()').extract())
        r1 = r"\d{4}\-\d{1,2}\-\d{1,2}\s\d{2}\:\d{2}\:\d{2}"
        date0 = re.compile(r1)
        date = ''.join(l.get_collected_values('date'))
        date1 = date0.findall(date)
        l.replace_value('date', date1[0])
        l.add_value('content',response.xpath('//div[@class="content"]/p/text()').extract())
        l.add_value('content',response.xpath('//div[@class="content"]/p/font/text()').extract())
        l.add_value('content',response.xpath('//div[@class="content"]/text()').extract())

        l.add_value('url', response.url)
        l.add_value('collection_name', self.name)
        l.add_value('website', self.website)
        return l.load_item()
Example #20
    def get_news(self, response):
        try:
            l = ItemLoader(item=SpiderItem(), response=response)
            l.add_value('title', response.xpath('//div[@class="wh645 left"]/p[1]/text()').extract())
            l.add_value('title', response.xpath('//p[@class="f22 lh30 yahei"]/a/text()').extract())
            l.add_value('title', response.xpath('//p[@class="f22 lh40 fb"]/text()').extract())

            l.add_value('date', response.xpath('//p[@class="lh30 left f14 yahei"]/text()').extract())
            l.add_value('date', response.xpath('//span[@id="pubtime_baidu"]/text()').extract())
            l.add_value('title', response.xpath('//div[@class="top"]/p/text()').extract())

            date = ''.join(l.get_collected_values('date'))
            # date = time.strptime(date.split()[0], '%Y-%m-%d')
            # l.replace_value('date', time.strftime('%Y-%m-%d %H:%M:%S', date))

            l.add_value('content', response.xpath('//div[@class="TRS_Editor"]/p/text()').extract())
            l.add_value('content', response.xpath('//div[@class="TRS_Editor"]/div/p/text()').extract())
            l.add_value('content', response.xpath('//div[@class="sanji_left"]/p/text()').extract())
            l.add_value('content', response.xpath('//div[@class="news-con"]/div/p/text()').extract())
            l.add_value('content', response.xpath('//div[@class="news-con"]/div/div/div/font/text()').extract())
            l.add_value('content', response.xpath('//div[@id="news-con"]/div/p/text()').extract())
            l.add_value('content', response.xpath('//div[@id="news-con"]/div/font/font/p/text()').extract())

            l.add_value('url', response.url)
            l.add_value('collection_name', self.name)
            url = response.url

            return l.load_item()
        except Exception as e:
            self.logger.error('error url: %s error msg: %s' % (response.url, e))
            l = ItemLoader(item=SpiderItem(), response=response)
            l.add_value('title', '')
            l.add_value('date', '1970-01-01 00:00:00')
            l.add_value('source', '')
            l.add_value('content', '')
            l.add_value('url', response.url)
            l.add_value('collection_name', self.name)
            l.add_value('website', self.website)
            return l.load_item()
Example #21
    def parse_article(self, response):
        loader = ItemLoader(item=OfficialItem(), response=response)

        index = response.meta.get("index")
        title = response.meta.get('title', None)
        tags_list = response.meta.get('tags_list')
        block_type = ",".join(tags_list)

        # fields to extract from the article: title, detail time, content, author, source
        article = response.xpath("//div[@class='article']")
        if not title:
            loader.add_xpath("title", ".//h1[@class='arti-title']//text()")
        else:
            loader.add_value("title", title)
        article_metas = article.xpath(
            ".//p[@class='arti-metas']//span//text()").extract()
        loader.add_value("detail_time", article_metas[0])
        loader.add_value("author", article_metas[1], re='作者:(.*)')
        loader.add_value("block_type", block_type)
        loader.add_value("content", response.xpath("//div[@id='content']"))
        loader.add_xpath("img", "//div[@id='content']//@src")
        loader.add_value("article_url", response.url)
        loader.add_value("tags_list", tags_list)
        loader.add_value("index", index)

        imgs = loader.get_collected_values("img")
        if imgs:
            for img in imgs:
                if "http" in img:
                    yield Request(img,
                                  callback=self.parse_img,
                                  dont_filter=True,
                                  meta={
                                      "type": "image",
                                      "article_url": response.url
                                  })

        yield loader.load_item()
Example #22
    def get_news(self, response):
        try:
            l = ItemLoader(item=SpiderItem(), response=response)
            l.add_value('title', response.xpath('//div[@class="layout"]/h2/text()').extract())
            l.add_value('title', response.xpath('//div[@id="wrapper"]/h1/text()').extract())
            l.add_value('title', response.xpath('//div[@class="top"]/h1/text()').extract())

            l.add_value('date', response.xpath('//div[@class="layout"]/div/text()').extract())
            l.add_value('date', response.xpath('//div[@class="left"]/span/text()').extract())
            l.add_value('title', response.xpath('//div[@class="top"]/p/text()').extract())

            date = ''.join(l.get_collected_values('date'))
            date = time.strptime(date.split()[0], '%Y-%m-%d')
            l.replace_value('date', time.strftime('%Y-%m-%d %H:%M:%S', date))

            l.add_value('content', response.xpath('//div[@class="news-con"]/p/text()').extract())
            l.add_value('content', response.xpath('//div[@id="news-con"]/div/div/p/text()').extract())
            l.add_value('content', response.xpath('//div[@class="news-con"]/div/p/text()').extract())
            l.add_value('content', response.xpath('//div[@class="news-con"]/div/div/div/font/text()').extract())
            l.add_value('content', response.xpath('//div[@id="news-con"]/div/p/text()').extract())
            l.add_value('content', response.xpath('//div[@id="news-con"]/div/font/font/p/text()').extract())

            l.add_value('url', response.url)
            l.add_value('collection_name', self.name)
            url = response.url

            return l.load_item()
        except Exception as e:
            self.logger.error('error url: %s error msg: %s' % (response.url, e))
            l = ItemLoader(item=SpiderItem(), response=response)
            l.add_value('title', '')
            l.add_value('date', '1970-01-01 00:00:00')
            l.add_value('source', '')
            l.add_value('content', '')
            l.add_value('url', response.url)
            l.add_value('collection_name', self.name)
            l.add_value('website', self.website)
            return l.load_item()
Example #23
    def parse_info(self, response):
        loaderJob = ItemLoader(item=JobInfoItem(), response=response)
        loaderJob.add_value("url", value=response.url)
        loaderJob.add_value("job_category", value=urllib.unquote(response.meta["category"]))
        loaderJob.add_xpath("job_name", '//div[@class="title-info over"]/h1/text()')
        loaderJob.add_xpath("job_name", '//div[@class="title-info "]/h1/text()')
        loaderJob.add_xpath("job_company", '//div[@class="title-info over"]/h3/text()')
        loaderJob.add_xpath("job_company", '//div[@class="title-info "]/h3/text()')
        loaderJob.add_xpath("job_company", '//div[@class="title-info "]/h3/a/text()')
        loaderJob.add_xpath("job_miniEdu", '//div[@class="resume clearfix"]/span/text()', TakeNumL(0))
        loaderJob.add_xpath("job_experience", '//div[@class="resume clearfix"]/span/text()', TakeNumL(1))
        loaderJob.add_xpath("job_reqLan", '//div[@class="resume clearfix"]/span/text()', TakeNumL(2))
        loaderJob.add_xpath("job_reqAge", '//div[@class="resume clearfix"]/span/text()', TakeNumL(3))
        loaderJob.add_xpath("job_salary", '//p[@class="job-main-title"]/text()', TakeFirstL())
        loaderJob.add_xpath("job_location", '//p[@class="basic-infor"]/span[1]/text()', TakeFirstL())
        loaderJob.add_xpath("job_update", '//p[@class="basic-infor"]/span[2]/text()', TakeFirstL(), re=u"(?<=发布于:).*")
        loaderJob.add_xpath(
            "job_desc", '//div[@class="content content-word"][1]', RemoveTagsL(), StripBlankL(), JoinL("")
        )
        loaderJob.add_xpath(
            "job_benefits",
            '//div[@class="job-main main-message"]',
            RemoveTagsL(),
            ReplaceBlank(),
            re=u"(?<=薪酬福利:)[\s\S]*",
        )
        loaderJob.add_xpath("job_benefits", '//div[@class="tag-list clearfix"]/span/text()', JoinL("|"))
        yield loaderJob.load_item()

        if "job.liepin.com" in response.url:
            loaderCom = ItemLoader(item=ComInfoItem(), response=response)
            loaderCom.add_value("url", value=response.url)
            loaderCom.add_value("com_name", value=loaderJob.get_collected_values("job_company"))
            loaderCom.add_xpath(
                "com_industry",
                '//div[@class="right-post-top"]/div[@class="content content-word"]/a[1]/@title',
                TakeFirstL(),
            )
            loaderCom.add_xpath(
                "com_size",
                '//div[@class="right-post-top"]/div[@class="content content-word"]',
                RemoveTagsL(),
                re=u"(?<=规模:)[\s\S]*?(?=<br>)",
            )
            loaderCom.add_xpath(
                "com_nature",
                '//div[@class="right-post-top"]/div[@class="content content-word"]',
                RemoveTagsL(),
                re=u"(?<=性质:)[\s\S]*?(?=<br>)",
            )
            loaderCom.add_xpath(
                "com_address",
                '//div[@class="right-post-top"]/div[@class="content content-word"]',
                RemoveTagsL(),
                re=u"(?<=地址:)[\s\S]*",
            )
            loaderCom.add_xpath(
                "com_intro",
                '//div[@class="job-main main-message noborder "]/div[@class="content content-word"]/text()',
                StripBlankL(),
                TakeFirstL(),
            )
            yield loaderCom.load_item()
Example #24
    def parse_post(self, response):
        '''
        parse post does multiple things:
            1) loads replied-to-comments page one-by-one (for DFS)
            2) call parse_reply on the nested comments
            3) adds simple (not-replied-to) comments
            4) follows to new comment page
        '''

        #load replied-to comments pages
        #select nested comment one-by-one matching with the index: response.meta['index']
        path = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and .//div[contains(@id,"comment_replies")]]' + '[' + str(
            response.meta['index']) + ']'
        group_flag = response.meta[
            'group'] if 'group' in response.meta else None

        for reply in response.xpath(path):
            source = reply.xpath('.//h3/a/text()').extract()
            answer = reply.xpath(
                './/a[contains(@href,"repl")]/@href').extract()
            ans = response.urljoin(answer[::-1][0])
            self.logger.info('{} nested comment'.format(
                str(response.meta['index'])))
            yield scrapy.Request(ans,
                                 callback=self.parse_reply,
                                 priority=1000,
                                 meta={
                                     'reply_to': source,
                                     'url': response.url,
                                     'index': response.meta['index'],
                                     'flag': 'init',
                                     'group': group_flag
                                 })

        #load regular comments
        if not response.xpath(path):  #prevents from exec
            path2 = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and not(.//div[contains(@id,"comment_replies")])]'
            for i, reply in enumerate(response.xpath(path2)):
                self.logger.info('{} regular comment'.format(i + 1))
                new = ItemLoader(item=CommentsItem(), selector=reply)
                new.context['lang'] = self.lang
                new.add_xpath('source', './/h3/a/text()')
                new.add_xpath('source_url', './/h3/a/@href')
                new.add_xpath('text', './/div[h3]/div[1]//text()')
                new.add_xpath('date', './/abbr/text()')
                new.add_xpath(
                    'reactions',
                    './/a[contains(@href,"reaction/profile")]//text()')
                new.add_value('url', response.url)
                """ 
                PROFILE REACTIONS SECTION
                adds functionality for adding profile and specific reaction data
                gets the profile url, creates a new item
                if the profile exists, add info to new item and increment 'check'
                to signal that new information has been added to the item
                and it's already been yielded
                repeat this process for reactions
                """

                #profile = response.xpath(".//h3/a/@href")
                #profile = response.urljoin(profile[0].extract())

                profile = "https://mbasic.facebook.com" + new.get_collected_values(
                    'source_url')[0]
                #print('profile', profile)
                #print('new item', new.get_collected_values('name'))

                item = new.load_item()
                check = 0
                if profile:
                    check += 1
                    yield scrapy.Request(profile,
                                         callback=self.parse_profile,
                                         meta={'item': item})

                temp = ItemLoader(item=CommentsItem(), selector=reply)
                temp.context['lang'] = self.lang

                temp.add_xpath(
                    'reactions',
                    './/a[contains(@href,"reaction/profile")]/@href')
                reactions = temp.get_collected_values('reactions')
                if reactions:
                    check += 1
                    reactions = "https://mbasic.facebook.com" + temp.get_collected_values(
                        'reactions')[0]
                    temp = 0
                    yield scrapy.Request(reactions,
                                         callback=self.parse_reactions,
                                         meta={'item': item})

                if check == 0:
                    yield item

        #new comment page
        if not response.xpath(path):
            #for groups
            next_xpath = './/div[contains(@id,"see_next")]'
            prev_xpath = './/div[contains(@id,"see_prev")]'
            if not response.xpath(next_xpath) or group_flag == 1:
                for next_page in response.xpath(prev_xpath):
                    new_page = next_page.xpath('.//@href').extract()
                    new_page = response.urljoin(new_page[0])
                    self.logger.info(
                        'New page to be crawled {}'.format(new_page))
                    yield scrapy.Request(new_page,
                                         callback=self.parse_post,
                                         meta={
                                             'index': 1,
                                             'group': 1
                                         })
            else:
                for next_page in response.xpath(next_xpath):
                    new_page = next_page.xpath('.//@href').extract()
                    new_page = response.urljoin(new_page[0])
                    self.logger.info(
                        'New page to be crawled {}'.format(new_page))
                    yield scrapy.Request(new_page,
                                         callback=self.parse_post,
                                         meta={
                                             'index': 1,
                                             'group': group_flag
                                         })
Example #25
    def get_news(self, response):
        loader = ItemLoader(item=SpiderItem(), response=response)
        try:
            loader.add_value(
                "title",
                response.xpath(
                    '//h1[@id="articleTitle"]/text()').extract_first())
            loader.add_value(
                "title",
                response.xpath(
                    '//div[@id="articleTitle"]/text()').extract_first())
            loader.add_value(
                "title",
                response.xpath(
                    '//h2[@id="toptitle"]/a/text()').extract_first())
            loader.add_value(
                "title",
                response.xpath(
                    '//div[@class="tit_dt"]/b/text()').extract_first())
            loader.add_value(
                "title",
                response.xpath(
                    '//div[@id="ArticleTitle"]/text()').extract_first())
            loader.add_value(
                "title",
                response.xpath(
                    '//h1[@class="picContentHeading"]/text()').extract_first())

            date = response.xpath(
                '//span[@id="pubTime"]/text()').extract_first()
            if date:
                loader.add_value("date", date + ":00")
            loader.add_value(
                "date", ''.join(
                    response.xpath('//div[@id="ArticleSourceAuthor"]/text()').
                    extract()).strip()[:19])
            if loader.get_collected_values("date") == '':
                end = response.url.find('/content_')
                loader.add_value(
                    "date",
                    response.url[end - 10:end].replace('/', '-') + " 00:00:00")

            loader.add_value(
                "content", ''.join(
                    response.xpath(
                        '//div[@id="contentMain"]/descendant-or-self::text()').
                    extract()))
            loader.add_value(
                "content", ''.join(
                    response.xpath(
                        '//div[@style="padding:15px 15px;line-height:28px;"]/descendant-or-self::text()'
                    ).extract()))
            loader.add_value(
                "content", ''.join(
                    response.xpath('//div[@class="con_dt"]/descendant::text()'
                                   ).extract()))
            loader.add_value(
                "content", ''.join(
                    response.xpath(
                        '//div[@id="ArticleContent"]/descendant::text()').
                    extract()))

        except Exception as e:
            self.logger.error('error url: %s error msg: %s' %
                              (response.url, e))
            loader.add_value('title', '')
            loader.add_value('date', '1970-01-01 00:00:00')
            loader.add_value('content', '')

        loader.add_value('url', response.url)
        loader.add_value('collection_name', self.name)
        loader.add_value('website', self.website)

        return loader.load_item()
Example #26
    def parse_post(self, response):
        # Web elements to extract post
        thread_item = response.meta['thread_item']
        thread_loader = ItemLoader(item=thread_item)
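        # recent itemloaders releases seed the loader with the item's existing values,
        # so the thread title stored on thread_item can be read back here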
        threadtitle = thread_loader.get_collected_values('threadtitle')

        for product in response.xpath("//div[contains(@class, 'post_block')]"):

            loader = ItemLoader(item=PostItem(), selector=product)
            loader.add_value('threadtitle', threadtitle)

            temp = product.css(
                "div.post_block div.post_wrap div.post_body").extract()
            temp = re.sub('<br>|<strong>|<\/strong>|<em>|<\/em>', ' ', temp[0])
            temp = re.sub('\n', ' ', temp)
            temp = re.sub('<blockquote(.*?)blockquote>', ' ', str(temp))

            selector = scrapy.Selector(text=str(temp))
            loader.add_value(
                "postcontent",
                selector.xpath(
                    "//div[contains(@class,'post_body')]/div[@itemprop='commentText'][1]"
                ).extract())

            loader.add_value(
                "authorname",
                product.css(
                    "div.post_wrap div.author_info div.user_details span[itemprop='name']::text"
                ).get(default='N/A'))
            loader.add_value(
                "authortype",
                product.css(
                    "div.post_wrap div.author_info div.user_details li.group_title::text"
                ).get())
            loader.add_value(
                "noposts",
                product.css(
                    "div.post_wrap div.author_info div.user_details li.post_count::text"
                ).get())
            if len(
                    product.css("div.post_wrap div.post_body div.signature").
                    getall()) > 0:
                loader.add_value(
                    "authorsign",
                    product.css(
                        "div.post_wrap div.post_body div.signature").getall())

            else:
                loader.add_value("authorsign", ['N/A'])

            loader.add_value(
                "date",
                product.css(
                    "div.post_wrap div.post_body p.posted_info abbr.published::text"
                ).get())

            yield loader.load_item()

        next_page = response.xpath(
            "//div[contains(@class, 'topic_controls')]/div[contains(@class, 'pagination')]/"
            "ul[contains(@class, 'forward')]/li[contains(@class, 'next')]/a/@href"
        ).extract_first()
        if next_page is not None:
            yield response.follow(next_page,
                                  callback=self.parse_post,
                                  meta={'thread_item': thread_item})  #
Example #27
    def get_news(self, response):
        try:
            loader = ItemLoader(item=SpiderItem(), response=response)

            loader.add_value(
                'title',
                response.xpath(
                    '//div[@class="text"]/h1/text()').extract_first())
            loader.add_value(
                'title',
                response.xpath('//div[@class="text_c clearfix"]/h1/text()').
                extract_first())
            loader.add_value(
                'title',
                response.xpath(
                    '//div[@class="text_c"]/h1/text()').extract_first())
            loader.add_value(
                'title',
                response.xpath('//div[@class="d2_left wb_left fl"]/h1/text()').
                extract_first())

            loader.add_value(
                'date',
                response.xpath(
                    '//p[@class="text_tools"]/text()').extract_first())
            loader.add_value(
                'date',
                response.xpath('////div[@class="text_c clearfix"]/h5/text()').
                extract_first())
            loader.add_value(
                'date',
                response.xpath('//p[@class="sou"]/text()').extract_first())
            loader.add_value(
                'date',
                response.xpath(
                    '//span[@id="p_publishtime"]/text()').extract_first())
            date = ''.join(loader.get_collected_values('date'))
            date = time.strptime(date.split()[0], '%Y年%m月%d日%H:%M')
            loader.replace_value('date', time.strftime('%Y-%m-%d', date))

            loader.add_value(
                'content', ''.join(
                    response.xpath(
                        '//div[@class="text_c"]/descendant-or-self::text()').
                    extract()))
            loader.add_value(
                'content', ''.join(
                    response.xpath(
                        '//div[@class="text_show"]/descendant-or-self::text()'
                    ).extract()))
            loader.add_value(
                'content', ''.join(
                    response.xpath(
                        '//div[@class="show_text"]/descendant-or-self::text()'
                    ).extract()))
            loader.add_value(
                'content', ''.join(
                    response.xpath(
                        '//div[@id="p_content"]/descendant-or-self::text()').
                    extract()))

        except Exception as e:
            self.logger.error('error url: %s error msg: %s' %
                              (response.url, e))
            # fall back to placeholder values on the same loader so the
            # trailing add_value calls work on both code paths
            loader.add_value('title', '')
            loader.add_value('date', '1970-01-01')
            loader.add_value('source', '')
            loader.add_value('content', '')

        loader.add_value('url', response.url)
        loader.add_value('collection_name', self.name)
        loader.add_value('website', self.website)
        return loader.load_item()
Example #28
    def parse_reply(self, response):
        '''
        parse reply to comments, root comment is added if flag
        '''
        #        from scrapy.utils.response import open_in_browser
        #        open_in_browser(response)

        if response.meta['flag'] == 'init':
            #parse root comment
            for root in response.xpath(
                    '//div[contains(@id,"root")]/div/div/div[count(@id)!=1 and contains("0123456789", substring(@id,1,1))]'
            ):
                new = ItemLoader(item=CommentsItem(), selector=root)
                new.context['lang'] = self.lang
                new.add_xpath('source', './/h3/a/text()')
                new.add_xpath('source_url', './/h3/a/@href')
                new.add_value('reply_to', 'ROOT')
                new.add_xpath('text', './/div[1]//text()')
                new.add_xpath('date', './/abbr/text()')
                new.add_xpath(
                    'reactions',
                    './/a[contains(@href,"reaction/profile")]//text()')
                new.add_value('url', response.url)
                #response --> reply/root
                """
                PROFILE REACTIONS SECTION (REPEAT SEE LINE 176 )
                the only difference is that, when getting the item temporarily
                the selector is the root instead of the reply, (it matches the for loop)
                """
                #profile = response.xpath(".//h3/a/@href")
                #profile = response.urljoin(profile[0].extract())
                profile = "https://mbasic.facebook.com" + new.get_collected_values(
                    'source_url')[0]
                print('profile', profile)
                #print('new item', new.get_collected_values('name'))
                item = new.load_item()
                check = 0
                if profile:
                    check += 1
                    yield scrapy.Request(profile,
                                         callback=self.parse_profile,
                                         meta={'item': item})

                #reactions = new.get_value('reactions')
                #print("reactions",reactions)

                temp = ItemLoader(item=CommentsItem(), selector=root)
                temp.context['lang'] = self.lang

                temp.add_xpath(
                    'reactions',
                    './/a[contains(@href,"reaction/profile")]/@href')
                reactions = temp.get_collected_values('reactions')
                if reactions:
                    check += 1
                    reactions = "https://mbasic.facebook.com" + temp.get_collected_values(
                        'reactions')[0]
                    temp = 0
                    yield scrapy.Request(reactions,
                                         callback=self.parse_reactions,
                                         meta={'item': item})

                if check == 0:
                    yield item

            #parse all replies in the page
            for reply in response.xpath(
                    '//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]'
            ):
                new = ItemLoader(item=CommentsItem(), selector=reply)
                new.context['lang'] = self.lang
                new.add_xpath('source', './/h3/a/text()')
                new.add_xpath('source_url', './/h3/a/@href')
                new.add_value('reply_to', response.meta['reply_to'])
                new.add_xpath('text', './/div[h3]/div[1]//text()')
                new.add_xpath('date', './/abbr/text()')
                new.add_xpath(
                    'reactions',
                    './/a[contains(@href,"reaction/profile")]//text()')
                new.add_value('url', response.url)
                """
                PROFILE REACTIONS SECTION SECTION (REPEAT SEE LINE 176)
                """
                #profile = response.xpath(".//h3/a/@href")
                #profile = response.urljoin(profile[0].extract())
                profile = "https://mbasic.facebook.com" + new.get_collected_values(
                    'source_url')[0]

                #print('new item', new.get_collected_values('name'))
                item = new.load_item()
                check = 0
                if profile:
                    check += 1
                    yield scrapy.Request(profile,
                                         callback=self.parse_profile,
                                         meta={'item': item})

                temp = ItemLoader(item=CommentsItem(), selector=reply)
                temp.context['lang'] = self.lang

                temp.add_xpath(
                    'reactions',
                    './/a[contains(@href,"reaction/profile")]/@href')
                reactions = temp.get_collected_values('reactions')
                if reactions:
                    check += 1
                    reactions = "https://mbasic.facebook.com" + temp.get_collected_values(
                        'reactions')[0]
                    temp = 0
                    yield scrapy.Request(reactions,
                                         callback=self.parse_reactions,
                                         meta={'item': item})

                if check == 0:
                    yield item

            back = response.xpath(
                '//div[contains(@id,"comment_replies_more_1")]/a/@href'
            ).extract()
            if back:
                self.logger.info('Back found, more nested comments')
                back_page = response.urljoin(back[0])
                yield scrapy.Request(back_page,
                                     callback=self.parse_reply,
                                     priority=1000,
                                     meta={
                                         'reply_to': response.meta['reply_to'],
                                         'flag': 'back',
                                         'url': response.meta['url'],
                                         'index': response.meta['index'],
                                         'group': response.meta['group']
                                     })

            else:
                next_reply = response.meta['url']
                self.logger.info(
                    'Nested comments crawl finished, heading to proper page: {}'
                    .format(response.meta['url']))
                yield scrapy.Request(next_reply,
                                     callback=self.parse_post,
                                     meta={
                                         'index': response.meta['index'] + 1,
                                         'group': response.meta['group']
                                     })

        elif response.meta['flag'] == 'back':
            """
            adds random pauses to reduce the chance of being blocked.
            DOWNSIDE: the crawl slows down, but it still runs reasonably
            quickly. The longer the pauses, the more likely you are to go
            undetected, but for large crawls long pauses may be impractical.
            """
            #print("did we make it")
            r = randrange(0, 20)
            time.sleep(r)
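            # NOTE: time.sleep() blocks Scrapy's single-threaded reactor, so
            # every queued request stalls during the pause. An alternative is
            # to let Scrapy randomize its own per-request delay; the settings
            # below are illustrative, not necessarily what this spider uses.
            #
            # custom_settings = {
            #     'DOWNLOAD_DELAY': 10,               # base delay in seconds
            #     'RANDOMIZE_DOWNLOAD_DELAY': True,   # waits 0.5x-1.5x the base delay
            # }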
            #parse all replies in the page
            for reply in response.xpath(
                    '//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]'
            ):
                #print("reply")
                new = ItemLoader(item=CommentsItem(), selector=reply)
                new.context['lang'] = self.lang
                new.add_xpath('source', './/h3/a/text()')
                new.add_xpath('source_url', './/h3/a/@href')
                new.add_value('reply_to', response.meta['reply_to'])
                new.add_xpath('text', './/div[h3]/div[1]//text()')
                new.add_xpath('date', './/abbr/text()')
                new.add_xpath(
                    'reactions',
                    './/a[contains(@href,"reaction/profile")]//text()')
                new.add_value('url', response.url)
                """
                PROFILE REACTIONS SECTION (REPEAT, SEE LINE 176)
                """

                profile = "https://mbasic.facebook.com" + new.get_collected_values(
                    'source_url')[0]

                #profile = response.xpath(".//h3/a/@href")
                #profile = response.urljoin(profile[0].extract())
                #print('profile', profile)
                #print('new item', new.get_collected_values('name'))
                check = 0
                item = new.load_item()
                if profile:
                    check += 1
                    print(1)
                    yield scrapy.Request(profile,
                                         callback=self.parse_profile,
                                         meta={'item': item})

                #response --> reply/root
                #print("before ", item)
                temp = ItemLoader(item=CommentsItem(), selector=reply)
                temp.context['lang'] = self.lang

                temp.add_xpath(
                    'reactions',
                    './/a[contains(@href,"reaction/profile")]/@href')
                reactions = temp.get_collected_values('reactions')
                if reactions:
                    check += 1
                    reactions = "https://mbasic.facebook.com" + temp.get_collected_values(
                        'reactions')[0]
                    temp = 0
                    print(2)
                    yield scrapy.Request(reactions,
                                         callback=self.parse_reactions,
                                         meta={'item': item})

                if check == 0:
                    print(3)
                    yield item
                #print("after ", item)

            #keep going backwards
            back = response.xpath(
                '//div[contains(@id,"comment_replies_more_1")]/a/@href'
            ).extract()
            if back:
                self.logger.info('Back found, more nested comments')
                back_page = response.urljoin(back[0])
                yield scrapy.Request(back_page,
                                     callback=self.parse_reply,
                                     priority=1000,
                                     meta={
                                         'reply_to': response.meta['reply_to'],
                                         'flag': 'back',
                                         'url': response.meta['url'],
                                         'index': response.meta['index'],
                                         'group': response.meta['group']
                                     })

            else:
                next_reply = response.meta['url']
                self.logger.info(
                    'Nested comments crawl finished, heading to proper page: {}'.
                    format(response.meta['url']))
                yield scrapy.Request(next_reply,
                                     callback=self.parse_post,
                                     meta={
                                         'index': response.meta['index'] + 1,
                                         'group': response.meta['group']
                                     })
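    # The profile and reactions callbacks are not included in this snippet;
    # the partially-built item is handed to them through the request meta and
    # is presumably yielded from there once enriched. A minimal sketch of such
    # a callback, assuming CommentsItem declares a hypothetical 'profile_info'
    # field (names here are illustrative, not the author's):
    def parse_profile_sketch(self, response):
        item = response.meta['item']
        # attach something from the profile page to the comment item
        item['profile_info'] = response.xpath('//title/text()').get()
        yield item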
Beispiel #29
0
    def getInfo(self, res):
        if not mch(res):
            return
        response = etree.HTML(res.text)
        loader = ItemLoader(item=booking.Booking(), response=res)
        supplier_obj_id = res.meta.get('statics.hotels.id')
        supplier_name = res.meta.get('statics.hotels.supplier')
        if supplier_obj_id:
            loader.add_value('statics_hotels_id', supplier_obj_id)
            loader.add_value('statics_hotels_supplier', supplier_name)
        pic = []
        # Each entry in self.allXpath is named "<field>_<label>"; the label
        # suffix picks the extraction strategy below. The 'sua', 'pic' and
        # 'xpl' entries pack an outer and an inner selector into one string,
        # separated by the literal token 'weego'.
        for e in self.allXpath:
            xpath = getattr(bk, e)
            fieldName, label = '_'.join(e.split('_')[:-1]), e.split('_')[-1]
            tempResult = ''
            if label == 'non':
                # plain XPath, first match
                if response.xpath(xpath):
                    tempResult = response.xpath(xpath)[0].strip()
            elif label == 'ren':
                # regex, first match
                if re.findall(xpath, res.text):
                    tempResult = re.findall(xpath, res.text)[0].strip()
            elif label == 'rea':
                # regex, all matches concatenated
                if re.findall(xpath, res.text):
                    for each in re.findall(xpath, res.text):
                        tempResult += each.strip()
            elif label == 'sub':
                # XPath node text with repeated newlines collapsed
                if response.xpath(xpath):
                    tempResult = re.sub(
                        '\\n+', '\\n',
                        response.xpath(xpath)[0].xpath('string(.)')).strip()
            elif label == 'sua':
                # outer selector + inner selector, results concatenated
                selects, subSelects, y = xpath.split('weego')[0], xpath.split(
                    'weego')[1], xpath.split('weego')[2:]
                for each in response.xpath(selects):
                    temp = each.xpath(subSelects)
                    if isinstance(temp, list):
                        tempResult += temp[0]
                    elif isinstance(temp, str):
                        tempResult += temp
                tempResult = re.sub('\\n+', '\\n', tempResult).strip()
            elif label == 'pic':
                # picture URLs collected via XPath
                selects, subSelects, y = xpath.split('weego')[0], xpath.split(
                    'weego')[1], xpath.split('weego')[2:]
                for each in response.xpath(selects):
                    temp = each.xpath(subSelects)
                    pic.append(temp[0])
                tempResult = pic
            elif label == 'pir':
                # picture URLs collected via regex
                for each in re.findall(xpath, res.text):
                    pic.append(each)
                tempResult = pic
            elif label == 'xpl':
                # list field: one cleaned value per outer selector match
                selects, subSelects, y = xpath.split('weego')[0], xpath.split(
                    'weego')[1], xpath.split('weego')[2:]
                tl = []
                for each in response.xpath(selects):
                    temp = re.sub('\\n+', ' - ',
                                  each.xpath(subSelects).strip())
                    tl.append(temp)
                loader.add_value(fieldName.lower(), tl)
            if label != 'xpl':
                # only overwrite a previously collected value if it was empty
                if loader.get_collected_values(fieldName.lower()):
                    if loader.get_collected_values(fieldName.lower())[0] == '':
                        loader.replace_value(fieldName.lower(), tempResult)
                else:
                    loader.add_value(fieldName.lower(), tempResult)
        yield loader.load_item()
    def parse_course(self, response):
        l = ItemLoader(item=ConestogacCourseItem(), response=response)
        l.default_output_processor = TakeFirst()

        course_data = response.xpath('//div[@data-accordion][1]')

        l.add_value('institution_name', 'Conestoga College')
        l.add_xpath('course_code', '//div[@class="hero-banner"]//span/text()')
        l.add_xpath('course_name',
                    '//h1[contains(@class, "text-white")]/text()')
        l.add_value(
            'delivery_types',
            course_data.xpath(
                './/small[strong[contains(text(), "Delivery:")]]/following-sibling::small/text()'
            ).get())
        l.add_value('url', response.url)
        # l.add_value('faculty', '???????????')
        l.add_xpath(
            'description',
            '//h2[contains(text(), "Course description")]/following-sibling::p[1]/text()'
        )

        price = course_data.xpath(
            './/small[strong[contains(text(), "Cost:")]]/following-sibling::small/text()'
        ).get()
        if price:
            price = price.lstrip('$')
        else:
            price = '0.0'
        l.add_value('price', [price])

        weekday_time_data = course_data.xpath(
            './/small[strong[contains(text(), "Day/Time:")]]/following-sibling::small/text()'
        ).getall()
        if not weekday_time_data:
            return
        weekday_time_data = [
            remove_garbage(data) for data in weekday_time_data
        ]
        # ['Thurs. 9:00am – 4:00pm', 'Fri. 9:00am – 4:00pm']
        weekday_time_data = [
            data for data in weekday_time_data if len(data) > 1
        ]

        if weekday_time_data:
            weekdays = [
                re.search(r'(^\w+)', d).group(1)
                if re.search(r'(^\w+)', d) else '' for d in weekday_time_data
            ]
            weekdays = [d for d in weekdays if d]
        else:
            weekdays = []

        l.add_value('days', [weekdays])
        l.add_value(
            'prerequisite',
            response.xpath(
                '//strong[contains(text(), "Prerequisites:")]/following-sibling::a/text()'
            ).getall())
        l.add_value(
            'corequisites',
            response.xpath(
                '//strong[contains(text(), "Corequisites:")]/following-sibling::a/text()'
            ).getall())
        l.add_value('program', 'Continuing Education')

        if weekday_time_data:
            duration_hours_list = [
                re.findall(r'\d{1,2}:\d{1,2}\w{2}', t)
                for t in weekday_time_data
            ]
        else:
            duration_hours_list = []
        l.add_value('duration_hours', duration_hours_list)
        l.add_value('duration_days_week', l.get_collected_values('days'))

        start_date = course_data.xpath(
            './/small[strong[contains(text(), "Start Date:")]]/following-sibling::small/text()'
        ).get()
        if start_date:
            start_date = re.sub(r'(\s*\.\s+|\s*,\s+)', '-', start_date)
            start_date = datetime.strptime(start_date, '%b-%d-%Y')

        end_date = course_data.xpath(
            './/small[strong[contains(text(), "End date:")]]/following-sibling::small/text()'
        ).get()
        if end_date:
            end_date = re.sub(r'(\s*\.\s+|\s*,\s+)', '-', end_date)
            end_date = datetime.strptime(end_date, '%b-%d-%Y')

        duration_month_list = [[start_date, end_date]]

        l.add_value('duration_months', duration_month_list)
        l.add_value('duration_as_string', [
            l.get_collected_values('duration_hours'),
            l.get_collected_values('duration_days_week'),
            l.get_collected_values('duration_months'),
        ])

        hours_site = course_data.xpath(
            './/small[strong[contains(text(), "Hours:")]]/following-sibling::small/text()'
        ).get()
        if not hours_site:
            hours_site = 0

        l.add_value('total_hours', [
            l.get_collected_values('duration_hours'),
            l.get_collected_values('duration_days_week'),
            hours_site,
        ])

        yield l.load_item()
    def parse_program(self, response):
        programs = response.xpath('//div[h2[@id]]')
        for program_block in programs:
            program = program_block.xpath('./h2/text()').get()

            program_block_html_string = program_block.get()
            program_block_html_string = re.sub(r'^\s*<div>\s*', '',
                                               program_block_html_string)
            program_block_html_string = re.sub(r'\s*</div>\s*$', '',
                                               program_block_html_string)

            courses = program_block_html_string.split('<hr class="modest">')
            courses = [el for el in courses if el]

            for course_html in courses:

                course = Selector(text=course_html)
                l = ItemLoader(item=CamosunCourseItem())
                # l.default_input_processor = MapCompose(lambda x: x.strip())
                l.default_output_processor = Join(' | ')

                course = course.xpath(
                    '//h3[@id and not(following-sibling::p[contains(@class, "alert-info")]) and not(following-sibling::del)]'
                )
                # If the block has no matching h3 element, skip it
                if not course:
                    continue

                l.add_value('institution_name', 'Camosun College')
                l.add_value('course_code', course.xpath('./@id').get())
                l.add_value('course_name', course.xpath('./text()').get())
                l.add_value('delivery_types', 'Onsite')
                l.add_value('url', response.url)
                l.add_value('faculty', response.meta['faculty'])
                l.add_value(
                    'description',
                    course.xpath('./following-sibling::p[1]//text()').getall())

                ul_blocks = course.xpath(
                    './following-sibling::ul[contains(string(), "$")]')

                # Skip course if no ul block with days and price
                if not ul_blocks:
                    continue

                ul_data = []
                dates_data = []
                for ul in ul_blocks:
                    # Parse weekdays and times
                    ul_string = remove_tags(ul.get())
                    ul_string = re.sub(r'\s{2,}', ' ', ul_string)
                    ul_string = remove_garbage(ul_string)
                    ul_string = ul_string.strip()
                    ul_data.append(ul_string)

                    # Parse dates text node
                    date_string = ul.xpath(
                        './preceding-sibling::text()[1]').get('')
                    date_string = remove_garbage(date_string)
                    # First check whether we got dates or just caught the bullets
                    if len(date_string) < 5:
                        date_string = ul.xpath(
                            '(./preceding-sibling::text()[2])').get('')
                        date_string = remove_garbage(date_string)
                    # Strip any leading garbage before the year 2019
                    re_search = re.search(r'^(.+)2019', date_string)
                    if re_search:
                        remove_pattern = re.escape(re_search.group(1))
                        date_string = re.sub(remove_pattern, '', date_string)
                    # Keep only strings that contain 2019 in the dates list
                    if '2019' in date_string:
                        dates_data.append(date_string.strip())

                prices = [
                    re.search(r'\$(\d+)', p).group(1) if re.search(
                        r'\$(\d+)', p) else '0.0' for p in ul_data if p
                ]
                l.add_value('price', prices)
                # l.add_value('subject', ul_data)

                # Get weekday strings
                # Drop strings that do not contain a time
                weekdays = [
                    wd if re.search(r'\d+:\d+\w{2}', wd) else ''
                    for wd in ul_data if wd
                ]
                # Extract the weekday portion of each string
                weekdays = [
                    re.search(r'^[^\d]+', wd).group() if re.search(
                        r'^[^\d]+', wd) else [] for wd in weekdays if wd
                ]
                # Strip trailing bullets from each string
                weekdays = [re.sub(r'\W+$', '', i) for i in weekdays if i]
                # Drop strings left empty by the cleanup above
                weekdays = [wd.split(' ') for wd in weekdays if wd]
                l.add_value('days', weekdays)
                l.add_value('program', program)

                # Get time groups like DD:DDam-DD:DDam
                duration_hours = [
                    re.findall(r'(\d+:\d+\w{2}-\d+:\d+\w{2})', tm)
                    for tm in ul_data if tm
                ]
                # Prepare list of times like [['6:30pm', '9:30pm'], ['8:30am', '4:30pm']]
                # duration_hours = [tm[0].split('-') for tm in duration_hours if tm]
                duration_hours_list = []
                for tm in duration_hours:
                    if not tm:
                        continue
                    if len(tm) > 1:
                        for interval in tm:
                            duration_hours_list.append(interval.split('-'))
                    else:
                        duration_hours_list.append(tm[0].split('-'))

                l.add_value('duration_hours', duration_hours_list)
                l.add_value('duration_days_week',
                            l.get_collected_values('days'))

                # Looking for month interval
                duration_month_list = []
                dur_month_tpl = '{year} {month}'
                for mon in dates_data:
                    if not mon:
                        continue
                    mon_res = re.search(
                        r'(2019).+(\w{3} \d+) - (\w{3} \d+)?|(2019).+(\w{3} \d+)',
                        mon)
                    if not mon_res:
                        continue
                    year, start_m, end_m, one_year, one_m = mon_res.groups()

                    if one_m:
                        m_start = dur_month_tpl.format(year=one_year,
                                                       month=one_m)
                        m_end = dur_month_tpl.format(year=one_year,
                                                     month=one_m)
                    else:
                        m_start = dur_month_tpl.format(year=year,
                                                       month=start_m)
                        m_end = dur_month_tpl.format(year=year, month=end_m)

                    duration_month_list.append([m_start, m_end])

                l.add_value('duration_months', duration_month_list)
                l.add_value('duration_as_string', [
                    l.get_collected_values('duration_hours'),
                    l.get_collected_values('duration_days_week'),
                    l.get_collected_values('duration_months'),
                ])

                l.add_value('total_hours', [
                    l.get_collected_values('duration_hours'),
                    l.get_collected_values('duration_days_week'),
                ])

                # l.add_value('corequisites', dates_data)

                yield l.load_item()
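In both course spiders above, the values already collected on the loader (duration_hours, days, duration_months) are fed back into add_value for duration_as_string and total_hours, so combining them into a single value presumably happens in the items' output processors, which are not part of this listing. A minimal sketch of such a processor under that assumption; the function names and the Field wiring are illustrative, not the original items.py:

def _flatten(values):
    # Recursively flatten the nested lists the loaders collect above.
    for v in values:
        if isinstance(v, (list, tuple)):
            yield from _flatten(v)
        else:
            yield str(v)


def combine_duration_as_string(values):
    # e.g. [[['6:30pm', '9:30pm']], [['Mon', 'Wed']], [['2019 Jan', '2019 Mar']]]
    # -> '6:30pm 9:30pm Mon Wed 2019 Jan 2019 Mar'
    return ' '.join(_flatten(values))


# Illustrative wiring on the item definition (not the original items.py):
# duration_as_string = scrapy.Field(output_processor=combine_duration_as_string)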