Example #1
    def get_news(self, response):
        l = ItemLoader(item=SpiderItem(), response=response)
        l.add_value(
            'title',
            response.xpath('//div[@id="contentwrap"]/h1/text()').extract())

        l.add_value('date',
                    response.xpath('//div[@class="infos"]/p/text()').extract())

        r1 = r"\d{4}\-\d{1,2}\-\d{1,2}\s\d{2}\:\d{2}\:\d{2}"
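        # get_collected_values('date') returns the raw values gathered so far
        # (before output processors run), so the timestamp can be pulled out with a regex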
        date0 = re.compile(r1)
        date = ''.join(l.get_collected_values('date'))
        date1 = date0.findall(date)
        l.replace_value('date', date1[0])
        l.add_value('content',
                    response.xpath('//div[@class="content"]/text()').extract())
        l.add_value(
            'content',
            response.xpath('//div[@class="description"]/text()').extract())
        l.add_value(
            'content',
            response.xpath('//div[@class="content"]/p/text()').extract())
        l.add_value(
            'content',
            response.xpath('//div[@class="content"]/div/p/text()').extract())

        l.add_value('url', response.url)
        l.add_value('collection_name', self.name)
        return l.load_item()
Example #2
    def get_news(self, response):
        try:
            l = ItemLoader(item=SpiderItem(), response=response)
            l.add_value('title', response.xpath('//span[@id="thread_subject"]/text()').extract())

            l.add_value('date', response.xpath('//div[@class="authi"]/em/text()').extract())

            r1 = r"\d{4}\-\d{1,2}\-\d{1,2}\s\d{2}\:\d{2}"
            date0 = re.compile(r1)
            date = ''.join(l.get_collected_values('date'))
            date1 = date0.findall(date)
            l.replace_value('date', date1[0])
            l.add_value('content', response.xpath('//div[@class="t_fsz"]/table/tr/td/div/text()').extract())
            l.add_value('content', response.xpath('//div[@class="t_fsz"]/table/tr/td/br/text()').extract())
            l.add_value('content', response.xpath('//div[@class="t_fsz"]/table/tr/td/text()').extract())
            l.add_value('content', response.xpath('//div[@class="t_fsz"]/table/tr/td/div/font/font/text()').extract())
            l.add_value('content', response.xpath('//div[@class="t_fsz"]/table/tr/td/font/font/text()').extract())
            l.add_value('content', response.xpath('//div[@class="t_fsz"]/table/tr/td/font/font/font/text()').extract())
            l.add_value('content', response.xpath('//div[@class="t_fsz"]/table/tr/td/p/font/font/text()').extract())
            l.add_value('content', response.xpath('//div[@class="t_fsz"]/table/tr/td/div/div/div/font/font/strong/text()').extract())

            l.add_value('url', response.url)
            l.add_value('collection_name', self.name)
            return l.load_item()
        except Exception as e:
            self.logger.error('error url: %s error msg: %s' % (response.url, e))
            l = ItemLoader(item=SpiderItem(), response=response)
            l.add_value('title', '')
            l.add_value('date', '1970-01-01 00:00:00')
            l.add_value('source', '')
            l.add_value('content', '')
            l.add_value('url', response.url)
            l.add_value('collection_name', self.name)
            l.add_value('website', self.website)
            return l.load_item()
Example #3
    def get_news(self, response):
        l = ItemLoader(item=SpiderItem(), response=response)
        l.add_value('title',
                    response.xpath('//table/tr[3]/td[2]/text()').extract())

        l.add_value('date',
                    response.xpath('//table/tr[4]/td/text()').extract())

        r1 = r"\d{4}\-\d{1,2}\-\d{1,2}"
        date0 = re.compile(r1)
        date = ''.join(l.get_collected_values('date'))
        date1 = date0.findall(date)
        l.replace_value('date', date1[0])
        l.add_value(
            'content',
            response.xpath('//td[@class="tdbg"]/div/font/text()').extract())
        l.add_value(
            'content',
            response.xpath('//td[@class="tdbg"]/p/font/text()').extract())
        l.add_value(
            'content',
            response.xpath('//td[@class="tdbg"]/p/span/text()').extract())
        l.add_value('content',
                    response.xpath('//td[@class="tdbg"]/p/text()').extract())

        l.add_value('url', response.url)
        l.add_value('collection_name', self.name)
        l.add_value('website', self.website)
        return l.load_item()
Example #4
    def get_news(self, response):
        try:
            l = ItemLoader(item=SpiderItem(), response=response)
            l.add_value('title', response.xpath('//div[@id="biaoti"]/text()').extract())
            l.add_value('title', response.xpath('//h1[@id="biaoti"]/text()').extract())

            l.add_value('date', response.xpath('//span[@id="pubtime_baidu"]/text()').extract())
            l.add_value('date', response.xpath('//div[@class="center lh32 grey12a"]/text()').extract())
            l.add_value('date', response.xpath('//div[@id="left"]/h2/text()').extract())

            l.add_value('content', response.xpath('//div[@id="zw"]/p/text()').extract())
            l.add_value('content', response.xpath('//div[@id="zw"]/strong/p/text()').extract())

            l.add_value('url', response.url)
            l.add_value('collection_name', self.name)
            url = response.url
            if url[11:17] == "shzfzz":
                date = ''.join(l.get_collected_values('date'))
                date = time.strptime(date.split()[0], u'%Y年%m月%d日')
                l.replace_value('date', time.strftime('%Y-%m-%d %H:%M:%S', date))
            return l.load_item()
        except Exception as e:
            self.logger.error('error url: %s error msg: %s' % (response.url, e))
            l = ItemLoader(item=SpiderItem(), response=response)
            l.add_value('title', '')
            l.add_value('date', '1970-01-01 00:00:00')
            l.add_value('source', '')
            l.add_value('content', '')
            l.add_value('url', response.url)
            l.add_value('collection_name', self.name)
            l.add_value('website', self.website)
            return l.load_item()
Example #5
    def parse_detail(self, response):
        t_name = response.xpath("//h1/a/text()").extract_first()
        for post in response.xpath("//li[@class='li_capsul_entry']"):
            l = ItemLoader(item=UludagtutorialItem(), selector=post)
            l.add_value("title", response.meta.get('title_name', t_name))
            l.add_xpath(
                "comment",
                ".//div[@class='entry-p']/text() | .//div[@class='entry-p']/a/text()"
            )
            l.add_xpath(
                "user",
                ".//div[@class='entry-secenekleri']/a[@class='alt-u yazar']/text()"
            )
            l.add_xpath("date", ".//span[@class='date-u']/a/text()")
            l.add_xpath(
                "url",
                "substring-after(.//div[@class='voting_nw']/a/@href, '//')")

            yield scrapy.FormRequest(
                "https://www.uludagsozluk.com/ax/?a=yenit&ne=ben&nw=pop",
                formdata={"benu": l.get_collected_values('user')[0]},
                method='POST',
                callback=self.parse_post_detail,
                dont_filter=True,
                meta={'l': l})

        next_page_url = response.xpath(
            "//a[@class='nextpage']/@href").extract_first()

        if next_page_url is not None:
            yield scrapy.Request("https://www.uludagsozluk.com" +
                                 next_page_url,
                                 callback=self.parse_detail,
                                 dont_filter=True)
Example #6
    def get_news(self, response):
        loader = ItemLoader(item=SpiderItem(), response=response)
        try:
            loader.add_value("title", response.xpath('//h1[@id="title"]/text()').extract_first())
            loader.add_value("title", response.xpath('//span[@id="title"]/text()').extract_first())

            loader.add_value("date", response.xpath('//span[@class="time"]/text()').extract_first())
            loader.add_value("date", response.xpath('//span[@id="pubtime"]/text()').extract_first())
            date = ''.join(loader.get_collected_values("date")).strip()
            date = time.strptime(date, '%Y年%m月%d日 %H:%M:%S')
            loader.replace_value("date", time.strftime("%Y-%m-%d %H:%M:%S", date))

            loader.add_value("content",
                             ''.join(response.xpath('//div[@id="content"]/descendant-or-self::text()').extract()))
            loader.add_value("content",
                             ''.join(response.xpath('//div[@class="article"]/descendant-or-self::text()').extract()))
        except Exception as e:
            self.logger.error('error url: %s error msg: %s' % (response.url, e))
            loader.add_value('title', '')
            loader.add_value('date', '1970-01-01 00:00:00')
            loader.add_value('content', '')

        loader.add_value('url', response.url)
        loader.add_value('collection_name', self.name)
        loader.add_value('website', self.website)

        return loader.load_item()
Example #7
    def get_news(self, response):
        try:
            l = ItemLoader(item=SpiderItem(), response=response)
            l.add_value('title', response.xpath('//div[@class="l_tit"]/text()').extract())

            l.add_value('date', response.xpath('//span[@id="pubtime_baidu"]/text()').extract())

            date = ''.join(l.get_collected_values('date'))
            # date = time.strptime(date.split()[0], '%Y-%m-%d')
            # l.replace_value('date', time.strftime('%Y-%m-%d %H:%M:%S', date))

            l.add_value('content', response.xpath('//div[@class="TRS_Editor"]/p/text()').extract())
            l.add_value('content', response.xpath('//div[@class="TRS_Editor"]/p/strong/text()').extract())
            l.add_value('content', response.xpath('//div[@class="article"]/div/p/text()').extract())

            l.add_value('url', response.url)
            l.add_value('collection_name', self.name)
            return l.load_item()
        except Exception as e:
            self.logger.error('error url: %s error msg: %s' % (response.url, e))
            l = ItemLoader(item=SpiderItem(), response=response)
            l.add_value('title', '')
            l.add_value('date', '1970-01-01 00:00:00')
            l.add_value('source', '')
            l.add_value('content', '')
            l.add_value('url', response.url)
            l.add_value('collection_name', self.name)
            l.add_value('website', self.website)
            return l.load_item()
Example #8
    def parse_item(self, response):

        l = ItemLoader(item=PropertiesItem(), response=response)

        l.add_xpath('author', '//*[@id="main-content"]/div[1]/span[2]/text()')
        l.add_xpath('title', '//*[@id="main-content"]/div[3]/span[2]/text()')
        l.add_xpath('datetime',
                    '//*[@id="main-content"]/div[4]/span[2]/text()')
        l.add_xpath('board', '//*[@id="main-content"]/div[2]/span[2]/text()')

        l.add_xpath('category',
                    '//*[@id="main-content"]/div[3]/span[2]/text()',
                    re='^\[.+\]')
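        # if no bracketed category tag was collected, fall back to matching a 'Re' reply prefix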
        if len(l.get_collected_values('category')) == 0:
            l.add_xpath('category',
                        '//*[@id="main-content"]/div[3]/span[2]/text()',
                        re='^Re')

        # Housekeeping fields
        l.add_value('url', response.url)
        l.add_value('project', self.settings.get('BOT_NAME'))
        l.add_value('spider', self.name)
        l.add_value('server', socket.gethostname())
        l.add_value('rtrv_date', datetime.datetime.now())

        return l.load_item()
Example #9
    def parse_article(self, response):
        loader = ItemLoader(item=XfjyArticleItem(), response=response)

        article_url = response.url
        title = response.meta["title"]
        # date = response.meta["date"]
        tags_list = response.meta["tags_list"]
        block_type = ",".join(tags_list)

        # handle the attachments here for now
        attchments = response.xpath(
            "//div[@class='main_nei_you_baio_content']//span//a")
        names_urls = [(attchment.xpath(".//span//text()").extract_first(),
                       attchment.xpath(".//@href").extract_first())
                      for attchment in attchments]
        name_url = {name: response.urljoin(url) for name, url in names_urls}
        attchments = json.dumps(name_url, ensure_ascii=False)

        index = response.meta.get("index")

        loader.add_value("article_url", article_url)
        loader.add_value("title", title)
        loader.add_value("tags_list", tags_list)
        loader.add_value("block_type", block_type)
        loader.add_value("attch_name_url", attchments)
        loader.add_xpath(
            "author",
            "//div[@class='main_nei_you_baio_content']//span[@class='authorstyle44003']//text()"
        )
        loader.add_value(
            "content",
            response.xpath(
                "//div[@class='main_nei_you_baio_content']//td[@class='contentstyle44003']"
            ))
        loader.add_xpath(
            "img",
            "//div[@class='main_nei_you_baio_content']//td[@class='contentstyle44003']//@src"
        )
        loader.add_xpath(
            "detail_time",
            "//div[@class='main_nei_you_baio_content']//span[@class='timestyle44003']//text()"
        )
        loader.add_value("index", index)

        imgs = loader.get_collected_values("img")
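        # request every absolute image URL collected from the article body; relative src values are skipped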
        if imgs:
            for img in imgs:
                if "http" in img:
                    yield Request(img,
                                  callback=self.parse_img,
                                  dont_filter=True,
                                  meta={
                                      "type": "image",
                                      "article_url": response.url
                                  })

        yield loader.load_item()
Example #10
    def get_news(self, response):
        try:
            loader = ItemLoader(item=SpiderItem(), response=response)

            loader.add_value(
                'title',
                response.xpath(
                    '//div[@class="left"]/h1/text()').extract_first())
            loader.add_value(
                'title',
                response.xpath('//h1[@class="h1"]/text()').extract_first())

            loader.add_value(
                'date',
                response.xpath('//div[@class="zuoze"]/text()').extract_first())
            loader.add_value(
                'date',
                response.xpath(
                    '//span[@class="post-time"]/text()').extract_first())
            date = ''.join(loader.get_collected_values('date'))
            if date == '':
                return
            loader.replace_value('date', date.strip() + ":00")

            loader.add_value(
                'content', ''.join(
                    response.xpath(
                        '//span[@id="zoom"]/descendant-or-self::text()').
                    extract()))
            loader.add_value(
                'content', ''.join(
                    response.xpath(
                        '//p[@class="summary"]/descendant-or-self::text()').
                    extract()))

            loader.add_value('url', response.url)
            loader.add_value('collection_name', self.name)
            loader.add_value('website', self.website)

            yield loader.load_item()
        except Exception as e:
            self.logger.error('error url: %s error msg: %s' %
                              (response.url, e))
            l = ItemLoader(item=SpiderItem(), response=response)
            l.add_value('title', '')
            l.add_value('date', '1970-01-01 00:00:00')
            l.add_value('source', '')
            l.add_value('content', '')
            l.add_value('url', response.url)
            l.add_value('collection_name', self.name)
            l.add_value('website', self.website)
            yield l.load_item()
Example #11
    def parse(self, response):
        load_data = ItemLoader(item=GoogleSearchBlockData(), response=response)
        load_data.add_xpath('name', '//div[@class="BNeawe vvjwJb AP7Wnd"]/text()')
        load_data.add_xpath('url', '//div[@class="kCrYT"]//a/@href')
        load_data.add_xpath('next_site', '//a[@class="nBDE1b G5eFlf"]/@href')
        next_page = load_data.get_collected_values('next_site')
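        # the last collected next_site link is assumed to point at the following results page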

        try:
            next_page = next_page[-1]
            yield load_data.load_item()
            yield scrapy.Request(next_page, callback=self.parse)
        except IndexError:
            self.log('\n\n My LOGGER: \n' + 'Finished downloading' + '\n\n')
Example #12
    def parse_article_ajax(self, response):
        try:
            article = json.loads(response.body)["data"]["article"]
        except BaseException as e:
            self.log(response.body.decode(), level=logging.DEBUG)
            self.log("解析json过程出现错误,没有article,链接:{},错误:{}".format(
                response.url, str(e)),
                     level=logging.ERROR)
            self.log("{}".format(str(response.request.body)),
                     level=logging.ERROR)
        else:
            loader = ItemLoader(item=YibanArticleItem(), response=response)
            article_url = response.meta.get("article_url")
            title = response.meta.get("title")
            tags_list = [
                "易班",
            ]
            tags_list.append(article.get("Sections_name"))
            block_type = ",".join(tags_list)
            content = article.get("content")
            detail_time = article.get("createTime")

            # the Yiban site has no attachments
            attchments = ""

            index = response.meta.get("index")

            loader.add_value("article_url", article_url)
            loader.add_value("title", title)
            loader.add_value("tags_list", tags_list)
            loader.add_value("block_type", block_type)
            loader.add_value("content", content)

            content_response = Selector(text=content)
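            # the article body arrives as an HTML string inside the JSON, so re-parse it to collect <img> src URLs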
            loader.add_value("img",
                             content_response.xpath("//img//@src").extract())
            loader.add_value("detail_time", detail_time)
            loader.add_value("index", index)

            imgs = loader.get_collected_values("img")
            if imgs:
                for img in imgs:
                    if "http" in img:
                        yield Request(img,
                                      callback=self.parse_img,
                                      dont_filter=True,
                                      meta={
                                          "type": "image",
                                          "article_url": article_url
                                      })
            yield loader.load_item()
Example #13
    def parse(self, response):
        base_post = response.css('article.post_preview')

        for post in base_post:
            item = ItemLoader(authorItem(), response)
            for key, value in self.author_css_selectors.items():
                item.add_value(key, post.css(value).extract())

            yield item.load_item()
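            # follow the author's posts page using the first collected author_url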
            yield response.follow(item.get_collected_values('author_url')[0] +
                                  'posts/',
                                  callback=self.parse_author)

        yield response.follow(response.css(
            self.line_post_css_selectors['post_url']).extract()[0],
                              callback=self.parse_post)
Example #14
    def parse(self, response):

        l = ItemLoader(
            item=LinksItem(),
            response=response,
        )
        l.add_value('url', response.meta['source_url'])
        links = set([])
        for n in [1, 2]:
            selector = 'div.entrytext p:nth-child(%s) a ::attr(href)' % n
            for link in response.css(selector).extract():
                if '/tag/' not in link:
                    links.add(link)
        l.add_value('links', list(links))
        l.add_value('count', len(l.get_collected_values('links')))

        yield l.load_item()
Example #15
    def parse_detail(self, response, char):
        loader = ItemLoader(item=char, response=response)

        loader.add_value("url", response.url)
        loader.add_css("image", selectors["CHARACTER_IMAGE"])
        loader.add_css("name", selectors["CHARACTER_NAME"])
        loader.add_css("feature_films", selectors["CHARACTER_FEATURE_FILMS"])
        loader.add_css("short_films", selectors["CHARACTER_SHORT_FILMS"])
        loader.add_css("shows", selectors["CHARACTER_SHOWS"])
        loader.add_css("games", selectors["CHARACTER_GAMES"])
        loader.add_css("rides", selectors["CHARACTER_RIDES"])
        loader.add_css("animator", selectors["CHARACTER_ANIMATOR"])
        loader.add_css("designer", selectors["CHARACTER_DESIGNER"])
        loader.add_css("voice", selectors["CHARACTER_VOICE"])
        loader.add_css("portrayed_by", selectors["CHARACTER_PORTRAYED_BY"])
        loader.add_css("performance_model",
                       selectors["CHARACTER_PERFORMANCE_MODEL"])
        loader.add_css("inspiration", selectors["CHARACTER_INSPIRATION"])
        loader.add_css("awards", selectors["CHARACTER_AWARDS"])
        loader.add_css("fullname", selectors["CHARACTER_FULLNAME"])
        loader.add_css("other_names", selectors["CHARACTER_OTHER_NAMES"])
        loader.add_css("occupation", selectors["CHARACTER_OCCUPATION"])
        loader.add_css("affiliations", selectors["CHARACTER_AFFILIATIONS"])
        loader.add_css("home", selectors["CHARACTER_HOME"])
        loader.add_css("likes", selectors["CHARACTER_LIKES"])
        loader.add_css("dislikes", selectors["CHARACTER_DISLIKES"])
        loader.add_css("powers", selectors["CHARACTER_POWERS"])
        loader.add_css("paraphernalia", selectors["CHARACTER_PARAPHERNALIA"])
        loader.add_css("status", selectors["CHARACTER_STATUS"])
        loader.add_css("parents", selectors["CHARACTER_PARENTS"])
        loader.add_css("siblings", selectors["CHARACTER_SIBLINGS"])
        loader.add_css("family", selectors["CHARACTER_FAMILY"])
        loader.add_css("partner", selectors["CHARACTER_PARTNER"])
        loader.add_css("children", selectors["CHARACTER_CHILDREN"])
        loader.add_css("pets", selectors["CHARACTER_PETS"])

        if len(loader.get_css(selectors["CHARACTER_NAME"])) < 1:
            loader.add_css("name", selectors["PAGE_HEADER_TITLE"])

        if len(loader.get_css(selectors["CHARACTER_IMAGE"])) < 1:
            loader.add_css("image", selectors["CHARACTER_THUMB_IMAGE"])

        logging.info("Crawl %s" % loader.get_collected_values("name"))

        char = loader.load_item()
        yield char
Example #16
    def parse_lot(self, response):
        l = ItemLoader(
            item=HbarrysmithKaufmanauctionswvAuctionsfirstResultItem(),
            response=response)
        l.default_output_processor = TakeFirst()

        l.add_xpath('LotNum', '//span[@class="lot-num"]/text()')
        l.add_xpath('Lead', '//span[@class="lot-name"]/text()')
        l.add_xpath(
            'Description',
            'string(//div[contains(@class, "description-info-content")])')
        l.add_xpath(
            'Price',
            '//span[@id and contains(text(), "Lot closed - High bid:")]/span/text()'
        )
        l.add_value('Sale', l.get_collected_values('Price'))

        yield l.load_item()
Example #17
 def get_news(self, response):
     l = ItemLoader(item=SpiderItem(), response=response)
     l.add_value(
         'title',
         response.xpath('//div[@class="article_title"]/text()').extract())
     l.add_value(
         'date',
         response.xpath('//div[@class="article_title1"]/text()').extract())
     r1 = r"\d{1,4}"
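     # the pattern captures year, month and day as separate numeric groups, re-joined with dashes below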
     date0 = re.compile(r1)
     date = ''.join(l.get_collected_values('date'))
     date1 = date0.findall(date)
     date1 = date1[0] + '-' + date1[1] + '-' + date1[2]
     l.replace_value('date', date1)
     l.add_value(
         'content',
         response.xpath('//div[@id="MyContent"]/p/span/text()').extract())
     l.add_value(
         'content',
         response.xpath(
             '//div[@id="MyContent"]/p/font/span/text()').extract())
     l.add_value(
         'content',
         response.xpath('//p[@class="MsoNormal"]/span/span/font/span/text()'
                        ).extract())
     l.add_value(
         'content',
         response.xpath(
             '//p[@class="MsoNormal"]/span/span/font/text()').extract())
     l.add_value(
         'content',
         response.xpath('//div[@class="article_intro"]/text()').extract())
     l.add_value(
         'content',
         response.xpath('//div[@id="MyContent"]/p/font/text()').extract())
     l.add_value(
         'content',
         response.xpath('//p[@id="MsoNormal"]/span/text()').extract())
     l.add_value('url', response.url)
     l.add_value('collection_name', self.name)
     l.add_value('website', self.website)
     return l.load_item()
Example #18
 def get_news(self, response):
     l = ItemLoader(item=SpiderItem(), response=response)
     l.add_value(
         'title',
         response.xpath(
             '//div[@id="lbyright_xwxq_title"]/text()').extract())
     l.add_value(
         'date',
         response.xpath('//div[@id="lbyright_xwxq_xxx"]/text()').extract())
     r1 = r"\d{4}\-\d{1,2}\-\d{1,2}"
     date0 = re.compile(r1)
     date = ''.join(l.get_collected_values('date'))
     date1 = date0.findall(date)
     l.replace_value('date', date1[0])
     l.add_value(
         'content',
         response.xpath(
             '//div[@id="lbyright_xwxq_txt"]/p/span/text()').extract())
     l.add_value('url', response.url)
     l.add_value('collection_name', self.name)
     l.add_value('website', self.website)
     return l.load_item()
Example #19
    def get_news(self,response):
        l = ItemLoader(item=SpiderItem(),response=response)
        l.add_value('title', response.xpath('//h2[@class="titleH2"]/text()').extract())
        l.add_value('title', response.xpath('//div[@class="Article-Left"]/h3/text()').extract())
        l.add_value('title', response.xpath('//div[@class="tit"]/h1/text()').extract())

        l.add_value('date',response.xpath('//div[@class="from"]/span/text()').extract())
        l.add_value('date',response.xpath('//div[@class="CopyFrom"]/text()').extract())
        l.add_value('date',response.xpath('//div[@class="auther-from"]/text()').extract())
        r1 = r"\d{4}\-\d{1,2}\-\d{1,2}\s\d{2}\:\d{2}\:\d{2}"
        date0 = re.compile(r1)
        date = ''.join(l.get_collected_values('date'))
        date1 = date0.findall(date)
        l.replace_value('date', date1[0])
        l.add_value('content',response.xpath('//div[@class="content"]/p/text()').extract())
        l.add_value('content',response.xpath('//div[@class="content"]/p/font/text()').extract())
        l.add_value('content',response.xpath('//div[@class="content"]/text()').extract())

        l.add_value('url', response.url)
        l.add_value('collection_name', self.name)
        l.add_value('website', self.website)
        return l.load_item()
Example #20
    def get_news(self, response):
        try:
            l = ItemLoader(item=SpiderItem(), response=response)
            l.add_value('title', response.xpath('//div[@class="wh645 left"]/p[1]/text()').extract())
            l.add_value('title', response.xpath('//p[@class="f22 lh30 yahei"]/a/text()').extract())
            l.add_value('title', response.xpath('//p[@class="f22 lh40 fb"]/text()').extract())

            l.add_value('date', response.xpath('//p[@class="lh30 left f14 yahei"]/text()').extract())
            l.add_value('date', response.xpath('//span[@id="pubtime_baidu"]/text()').extract())
            l.add_value('title', response.xpath('//div[@class="top"]/p/text()').extract())

            date = ''.join(l.get_collected_values('date'))
            # date = time.strptime(date.split()[0], '%Y-%m-%d')
            # l.replace_value('date', time.strftime('%Y-%m-%d %H:%M:%S', date))

            l.add_value('content', response.xpath('//div[@class="TRS_Editor"]/p/text()').extract())
            l.add_value('content', response.xpath('//div[@class="TRS_Editor"]/div/p/text()').extract())
            l.add_value('content', response.xpath('//div[@class="sanji_left"]/p/text()').extract())
            l.add_value('content', response.xpath('//div[@class="news-con"]/div/p/text()').extract())
            l.add_value('content', response.xpath('//div[@class="news-con"]/div/div/div/font/text()').extract())
            l.add_value('content', response.xpath('//div[@id="news-con"]/div/p/text()').extract())
            l.add_value('content', response.xpath('//div[@id="news-con"]/div/font/font/p/text()').extract())

            l.add_value('url', response.url)
            l.add_value('collection_name', self.name)
            url = response.url

            return l.load_item()
        except Exception as e:
            self.logger.error('error url: %s error msg: %s' % (response.url, e))
            l = ItemLoader(item=SpiderItem(), response=response)
            l.add_value('title', '')
            l.add_value('date', '1970-01-01 00:00:00')
            l.add_value('source', '')
            l.add_value('content', '')
            l.add_value('url', response.url)
            l.add_value('collection_name', self.name)
            l.add_value('website', self.website)
            return l.load_item()
Example #21
    def parse_article(self, response):
        loader = ItemLoader(item=OfficialItem(), response=response)

        index = response.meta.get("index")
        title = response.meta.get('title', None)
        tags_list = response.meta.get('tags_list')
        block_type = ",".join(tags_list)

        # fields to extract from the article: title, detail time, content, author, source
        article = response.xpath("//div[@class='article']")
        if not title:
            loader.add_xpath("title", ".//h1[@class='arti-title']//text()")
        else:
            loader.add_value("title", title)
        article_metas = article.xpath(
            ".//p[@class='arti-metas']//span//text()").extract()
        loader.add_value("detail_time", article_metas[0])
        loader.add_value("author", article_metas[1], re='作者:(.*)')
        loader.add_value("block_type", block_type)
        loader.add_value("content", response.xpath("//div[@id='content']"))
        loader.add_xpath("img", "//div[@id='content']//@src")
        loader.add_value("article_url", response.url)
        loader.add_value("tags_list", tags_list)
        loader.add_value("index", index)

        imgs = loader.get_collected_values("img")
        if imgs:
            for img in imgs:
                if "http" in img:
                    yield Request(img,
                                  callback=self.parse_img,
                                  dont_filter=True,
                                  meta={
                                      "type": "image",
                                      "article_url": response.url
                                  })

        yield loader.load_item()
Example #22
    def get_news(self, response):
        try:
            l = ItemLoader(item=SpiderItem(), response=response)
            l.add_value('title', response.xpath('//div[@class="layout"]/h2/text()').extract())
            l.add_value('title', response.xpath('//div[@id="wrapper"]/h1/text()').extract())
            l.add_value('title', response.xpath('//div[@class="top"]/h1/text()').extract())

            l.add_value('date', response.xpath('//div[@class="layout"]/div/text()').extract())
            l.add_value('date', response.xpath('//div[@class="left"]/span/text()').extract())
            l.add_value('title', response.xpath('//div[@class="top"]/p/text()').extract())

            date = ''.join(l.get_collected_values('date'))
            date = time.strptime(date.split()[0], '%Y-%m-%d')
            l.replace_value('date', time.strftime('%Y-%m-%d %H:%M:%S', date))

            l.add_value('content', response.xpath('//div[@class="news-con"]/p/text()').extract())
            l.add_value('content', response.xpath('//div[@id="news-con"]/div/div/p/text()').extract())
            l.add_value('content', response.xpath('//div[@class="news-con"]/div/p/text()').extract())
            l.add_value('content', response.xpath('//div[@class="news-con"]/div/div/div/font/text()').extract())
            l.add_value('content', response.xpath('//div[@id="news-con"]/div/p/text()').extract())
            l.add_value('content', response.xpath('//div[@id="news-con"]/div/font/font/p/text()').extract())

            l.add_value('url', response.url)
            l.add_value('collection_name', self.name)
            url = response.url

            return l.load_item()
        except Exception as e:
            self.logger.error('error url: %s error msg: %s' % (response.url, e))
            l = ItemLoader(item=SpiderItem(), response=response)
            l.add_value('title', '')
            l.add_value('date', '1970-01-01 00:00:00')
            l.add_value('source', '')
            l.add_value('content', '')
            l.add_value('url', response.url)
            l.add_value('collection_name', self.name)
            l.add_value('website', self.website)
            return l.load_item()
Example #23
    def parse_info(self, response):
        loaderJob = ItemLoader(item=JobInfoItem(), response=response)
        loaderJob.add_value("url", value=response.url)
        loaderJob.add_value("job_category", value=urllib.unquote(response.meta["category"]))
        loaderJob.add_xpath("job_name", '//div[@class="title-info over"]/h1/text()')
        loaderJob.add_xpath("job_name", '//div[@class="title-info "]/h1/text()')
        loaderJob.add_xpath("job_company", '//div[@class="title-info over"]/h3/text()')
        loaderJob.add_xpath("job_company", '//div[@class="title-info "]/h3/text()')
        loaderJob.add_xpath("job_company", '//div[@class="title-info "]/h3/a/text()')
        loaderJob.add_xpath("job_miniEdu", '//div[@class="resume clearfix"]/span/text()', TakeNumL(0))
        loaderJob.add_xpath("job_experience", '//div[@class="resume clearfix"]/span/text()', TakeNumL(1))
        loaderJob.add_xpath("job_reqLan", '//div[@class="resume clearfix"]/span/text()', TakeNumL(2))
        loaderJob.add_xpath("job_reqAge", '//div[@class="resume clearfix"]/span/text()', TakeNumL(3))
        loaderJob.add_xpath("job_salary", '//p[@class="job-main-title"]/text()', TakeFirstL())
        loaderJob.add_xpath("job_location", '//p[@class="basic-infor"]/span[1]/text()', TakeFirstL())
        loaderJob.add_xpath("job_update", '//p[@class="basic-infor"]/span[2]/text()', TakeFirstL(), re=u"(?<=发布于:).*")
        loaderJob.add_xpath(
            "job_desc", '//div[@class="content content-word"][1]', RemoveTagsL(), StripBlankL(), JoinL("")
        )
        loaderJob.add_xpath(
            "job_benefits",
            '//div[@class="job-main main-message"]',
            RemoveTagsL(),
            ReplaceBlank(),
            re=u"(?<=薪酬福利:)[\s\S]*",
        )
        loaderJob.add_xpath("job_benefits", '//div[@class="tag-list clearfix"]/span/text()', JoinL("|"))
        yield loaderJob.load_item()

        if "job.liepin.com" in response.url:
            loaderCom = ItemLoader(item=ComInfoItem(), response=response)
            loaderCom.add_value("url", value=response.url)
            loaderCom.add_value("com_name", value=loaderJob.get_collected_values("job_company"))
            loaderCom.add_xpath(
                "com_industry",
                '//div[@class="right-post-top"]/div[@class="content content-word"]/a[1]/@title',
                TakeFirstL(),
            )
            loaderCom.add_xpath(
                "com_size",
                '//div[@class="right-post-top"]/div[@class="content content-word"]',
                RemoveTagsL(),
                re=u"(?<=规模:)[\s\S]*?(?=<br>)",
            )
            loaderCom.add_xpath(
                "com_nature",
                '//div[@class="right-post-top"]/div[@class="content content-word"]',
                RemoveTagsL(),
                re=u"(?<=性质:)[\s\S]*?(?=<br>)",
            )
            loaderCom.add_xpath(
                "com_address",
                '//div[@class="right-post-top"]/div[@class="content content-word"]',
                RemoveTagsL(),
                re=u"(?<=地址:)[\s\S]*",
            )
            loaderCom.add_xpath(
                "com_intro",
                '//div[@class="job-main main-message noborder "]/div[@class="content content-word"]/text()',
                StripBlankL(),
                TakeFirstL(),
            )
            yield loaderCom.load_item()
Example #24
    def parse_post(self, response):
        '''
        parse post does multiple things:
            1) loads replied-to-comments page one-by-one (for DFS)
            2) call parse_reply on the nested comments
            3) adds simple (not-replied-to) comments
            4) follows to new comment page
        '''

        #load replied-to comments pages
        #select nested comment one-by-one matching with the index: response.meta['index']
        path = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and .//div[contains(@id,"comment_replies")]]' + '[' + str(
            response.meta['index']) + ']'
        group_flag = response.meta[
            'group'] if 'group' in response.meta else None

        for reply in response.xpath(path):
            source = reply.xpath('.//h3/a/text()').extract()
            answer = reply.xpath(
                './/a[contains(@href,"repl")]/@href').extract()
            ans = response.urljoin(answer[::-1][0])
            self.logger.info('{} nested comment'.format(
                str(response.meta['index'])))
            yield scrapy.Request(ans,
                                 callback=self.parse_reply,
                                 priority=1000,
                                 meta={
                                     'reply_to': source,
                                     'url': response.url,
                                     'index': response.meta['index'],
                                     'flag': 'init',
                                     'group': group_flag
                                 })

        #load regular comments
        if not response.xpath(path):  #prevents from exec
            path2 = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and not(.//div[contains(@id,"comment_replies")])]'
            for i, reply in enumerate(response.xpath(path2)):
                self.logger.info('{} regular comment'.format(i + 1))
                new = ItemLoader(item=CommentsItem(), selector=reply)
                new.context['lang'] = self.lang
                new.add_xpath('source', './/h3/a/text()')
                new.add_xpath('source_url', './/h3/a/@href')
                new.add_xpath('text', './/div[h3]/div[1]//text()')
                new.add_xpath('date', './/abbr/text()')
                new.add_xpath(
                    'reactions',
                    './/a[contains(@href,"reaction/profile")]//text()')
                new.add_value('url', response.url)
                """ 
                PROFILE REACTIONS SECTION
                adds functionality for adding profile and specific reaction data
                gets the profile url, creates a new item
                if the profile exists, add info to new item and increment 'check'
                to signal that new information has been added to the item
                and it's already been yielded
                repeat this process for reactions
                """

                #profile = response.xpath(".//h3/a/@href")
                #profile = response.urljoin(profile[0].extract())

                profile = "https://mbasic.facebook.com" + new.get_collected_values(
                    'source_url')[0]
                #print('profile', profile)
                #print('new item', new.get_collected_values('name'))

                item = new.load_item()
                check = 0
                if profile:
                    check += 1
                    yield scrapy.Request(profile,
                                         callback=self.parse_profile,
                                         meta={'item': item})

                temp = ItemLoader(item=CommentsItem(), selector=reply)
                temp.context['lang'] = self.lang

                temp.add_xpath(
                    'reactions',
                    './/a[contains(@href,"reaction/profile")]/@href')
                reactions = temp.get_collected_values('reactions')
                if reactions:
                    check += 1
                    reactions = "https://mbasic.facebook.com" + temp.get_collected_values(
                        'reactions')[0]
                    temp = 0
                    yield scrapy.Request(reactions,
                                         callback=self.parse_reactions,
                                         meta={'item': item})

                if check == 0:
                    yield item

        #new comment page
        if not response.xpath(path):
            #for groups
            next_xpath = './/div[contains(@id,"see_next")]'
            prev_xpath = './/div[contains(@id,"see_prev")]'
            if not response.xpath(next_xpath) or group_flag == 1:
                for next_page in response.xpath(prev_xpath):
                    new_page = next_page.xpath('.//@href').extract()
                    new_page = response.urljoin(new_page[0])
                    self.logger.info(
                        'New page to be crawled {}'.format(new_page))
                    yield scrapy.Request(new_page,
                                         callback=self.parse_post,
                                         meta={
                                             'index': 1,
                                             'group': 1
                                         })
            else:
                for next_page in response.xpath(next_xpath):
                    new_page = next_page.xpath('.//@href').extract()
                    new_page = response.urljoin(new_page[0])
                    self.logger.info(
                        'New page to be crawled {}'.format(new_page))
                    yield scrapy.Request(new_page,
                                         callback=self.parse_post,
                                         meta={
                                             'index': 1,
                                             'group': group_flag
                                         })
Example #25
    def get_news(self, response):
        loader = ItemLoader(item=SpiderItem(), response=response)
        try:
            loader.add_value(
                "title",
                response.xpath(
                    '//h1[@id="articleTitle"]/text()').extract_first())
            loader.add_value(
                "title",
                response.xpath(
                    '//div[@id="articleTitle"]/text()').extract_first())
            loader.add_value(
                "title",
                response.xpath(
                    '//h2[@id="toptitle"]/a/text()').extract_first())
            loader.add_value(
                "title",
                response.xpath(
                    '//div[@class="tit_dt"]/b/text()').extract_first())
            loader.add_value(
                "title",
                response.xpath(
                    '//div[@id="ArticleTitle"]/text()').extract_first())
            loader.add_value(
                "title",
                response.xpath(
                    '//h1[@class="picContentHeading"]/text()').extract_first())

            date = response.xpath(
                '//span[@id="pubTime"]/text()').extract_first()
            if date:
                loader.add_value("date", date + ":00")
            loader.add_value(
                "date", ''.join(
                    response.xpath('//div[@id="ArticleSourceAuthor"]/text()').
                    extract()).strip()[:19])
            if loader.get_collected_values("date") == '':
                end = response.url.find('/content_')
                loader.add_value(
                    "date",
                    response.url[end - 10:end].replace('/', '-') + " 00:00:00")

            loader.add_value(
                "content", ''.join(
                    response.xpath(
                        '//div[@id="contentMain"]/descendant-or-self::text()').
                    extract()))
            loader.add_value(
                "content", ''.join(
                    response.xpath(
                        '//div[@style="padding:15px 15px;line-height:28px;"]/descendant-or-self::text()'
                    ).extract()))
            loader.add_value(
                "content", ''.join(
                    response.xpath('//div[@class="con_dt"]/descendant::text()'
                                   ).extract()))
            loader.add_value(
                "content", ''.join(
                    response.xpath(
                        '//div[@id="ArticleContent"]/descendant::text()').
                    extract()))

        except Exception as e:
            self.logger.error('error url: %s error msg: %s' %
                              (response.url, e))
            loader.add_value('title', '')
            loader.add_value('date', '1970-01-01 00:00:00')
            loader.add_value('content', '')

        loader.add_value('url', response.url)
        loader.add_value('collection_name', self.name)
        loader.add_value('website', self.website)

        return loader.load_item()
Example #26
    def parse_post(self, response):
        # Web elements to extract post
        thread_item = response.meta['thread_item']
        thread_loader = ItemLoader(item=thread_item)
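        # recent itemloaders releases seed the loader with the item's existing values,
        # so the thread title stored on thread_item can be read back here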
        threadtitle = thread_loader.get_collected_values('threadtitle')

        for product in response.xpath("//div[contains(@class, 'post_block')]"):

            loader = ItemLoader(item=PostItem(), selector=product)
            loader.add_value('threadtitle', threadtitle)

            temp = product.css(
                "div.post_block div.post_wrap div.post_body").extract()
            temp = re.sub('<br>|<strong>|<\/strong>|<em>|<\/em>', ' ', temp[0])
            temp = re.sub('\n', ' ', temp)
            temp = re.sub('<blockquote(.*?)blockquote>', ' ', str(temp))

            selector = scrapy.Selector(text=str(temp))
            loader.add_value(
                "postcontent",
                selector.xpath(
                    "//div[contains(@class,'post_body')]/div[@itemprop='commentText'][1]"
                ).extract())

            loader.add_value(
                "authorname",
                product.css(
                    "div.post_wrap div.author_info div.user_details span[itemprop='name']::text"
                ).get(default='N/A'))
            loader.add_value(
                "authortype",
                product.css(
                    "div.post_wrap div.author_info div.user_details li.group_title::text"
                ).get())
            loader.add_value(
                "noposts",
                product.css(
                    "div.post_wrap div.author_info div.user_details li.post_count::text"
                ).get())
            if len(
                    product.css("div.post_wrap div.post_body div.signature").
                    getall()) > 0:
                loader.add_value(
                    "authorsign",
                    product.css(
                        "div.post_wrap div.post_body div.signature").getall())

            else:
                loader.add_value("authorsign", ['N/A'])

            loader.add_value(
                "date",
                product.css(
                    "div.post_wrap div.post_body p.posted_info abbr.published::text"
                ).get())

            yield loader.load_item()

        next_page = response.xpath(
            "//div[contains(@class, 'topic_controls')]/div[contains(@class, 'pagination')]/"
            "ul[contains(@class, 'forward')]/li[contains(@class, 'next')]/a/@href"
        ).extract_first()
        if next_page is not None:
            yield response.follow(next_page,
                                  callback=self.parse_post,
                                  meta={'thread_item': thread_item})  #
Example #27
    def get_news(self, response):
        try:
            loader = ItemLoader(item=SpiderItem(), response=response)

            loader.add_value(
                'title',
                response.xpath(
                    '//div[@class="text"]/h1/text()').extract_first())
            loader.add_value(
                'title',
                response.xpath('//div[@class="text_c clearfix"]/h1/text()').
                extract_first())
            loader.add_value(
                'title',
                response.xpath(
                    '//div[@class="text_c"]/h1/text()').extract_first())
            loader.add_value(
                'title',
                response.xpath('//div[@class="d2_left wb_left fl"]/h1/text()').
                extract_first())

            loader.add_value(
                'date',
                response.xpath(
                    '//p[@class="text_tools"]/text()').extract_first())
            loader.add_value(
                'date',
                response.xpath('////div[@class="text_c clearfix"]/h5/text()').
                extract_first())
            loader.add_value(
                'date',
                response.xpath('//p[@class="sou"]/text()').extract_first())
            loader.add_value(
                'date',
                response.xpath(
                    '//span[@id="p_publishtime"]/text()').extract_first())
            date = ''.join(loader.get_collected_values('date'))
            date = time.strptime(date.split()[0], '%Y年%m月%d日%H:%M')
            loader.replace_value('date', time.strftime('%Y-%m-%d', date))

            loader.add_value(
                'content', ''.join(
                    response.xpath(
                        '//div[@class="text_c"]/descendant-or-self::text()').
                    extract()))
            loader.add_value(
                'content', ''.join(
                    response.xpath(
                        '//div[@class="text_show"]/descendant-or-self::text()'
                    ).extract()))
            loader.add_value(
                'content', ''.join(
                    response.xpath(
                        '//div[@class="show_text"]/descendant-or-self::text()'
                    ).extract()))
            loader.add_value(
                'content', ''.join(
                    response.xpath(
                        '//div[@id="p_content"]/descendant-or-self::text()').
                    extract()))

        except Exception as e:
            self.logger.error('error url: %s error msg: %s' %
                              (response.url, e))
            # fall back to placeholder values on the same loader so the
            # trailing add_value calls work on both code paths
            loader.add_value('title', '')
            loader.add_value('date', '1970-01-01')
            loader.add_value('source', '')
            loader.add_value('content', '')

        loader.add_value('url', response.url)
        loader.add_value('collection_name', self.name)
        loader.add_value('website', self.website)
        return loader.load_item()
Example #28
    def parse_reply(self, response):
        '''
        parse reply to comments, root comment is added if flag
        '''
        #        from scrapy.utils.response import open_in_browser
        #        open_in_browser(response)

        if response.meta['flag'] == 'init':
            #parse root comment
            for root in response.xpath(
                    '//div[contains(@id,"root")]/div/div/div[count(@id)!=1 and contains("0123456789", substring(@id,1,1))]'
            ):
                new = ItemLoader(item=CommentsItem(), selector=root)
                new.context['lang'] = self.lang
                new.add_xpath('source', './/h3/a/text()')
                new.add_xpath('source_url', './/h3/a/@href')
                new.add_value('reply_to', 'ROOT')
                new.add_xpath('text', './/div[1]//text()')
                new.add_xpath('date', './/abbr/text()')
                new.add_xpath(
                    'reactions',
                    './/a[contains(@href,"reaction/profile")]//text()')
                new.add_value('url', response.url)
                #response --> reply/root
                """
                PROFILE REACTIONS SECTION (REPEAT SEE LINE 176 )
                the only difference is that, when getting the item temporarily
                the selector is the root instead of the reply, (it matches the for loop)
                """
                #profile = response.xpath(".//h3/a/@href")
                #profile = response.urljoin(profile[0].extract())
                profile = "https://mbasic.facebook.com" + new.get_collected_values(
                    'source_url')[0]
                print('profile', profile)
                #print('new item', new.get_collected_values('name'))
                item = new.load_item()
                check = 0
                if profile:
                    check += 1
                    yield scrapy.Request(profile,
                                         callback=self.parse_profile,
                                         meta={'item': item})

                #reactions = new.get_value('reactions')
                #print("reactions",reactions)

                temp = ItemLoader(item=CommentsItem(), selector=root)
                temp.context['lang'] = self.lang

                temp.add_xpath(
                    'reactions',
                    './/a[contains(@href,"reaction/profile")]/@href')
                reactions = temp.get_collected_values('reactions')
                if reactions:
                    check += 1
                    reactions = "https://mbasic.facebook.com" + temp.get_collected_values(
                        'reactions')[0]
                    temp = 0
                    yield scrapy.Request(reactions,
                                         callback=self.parse_reactions,
                                         meta={'item': item})

                if check == 0:
                    yield item

            #parse all replies in the page
            for reply in response.xpath(
                    '//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]'
            ):
                new = ItemLoader(item=CommentsItem(), selector=reply)
                new.context['lang'] = self.lang
                new.add_xpath('source', './/h3/a/text()')
                new.add_xpath('source_url', './/h3/a/@href')
                new.add_value('reply_to', response.meta['reply_to'])
                new.add_xpath('text', './/div[h3]/div[1]//text()')
                new.add_xpath('date', './/abbr/text()')
                new.add_xpath(
                    'reactions',
                    './/a[contains(@href,"reaction/profile")]//text()')
                new.add_value('url', response.url)
                """
                PROFILE REACTIONS SECTION SECTION (REPEAT SEE LINE 176)
                """
                #profile = response.xpath(".//h3/a/@href")
                #profile = response.urljoin(profile[0].extract())
                profile = "https://mbasic.facebook.com" + new.get_collected_values(
                    'source_url')[0]

                #print('new item', new.get_collected_values('name'))
                item = new.load_item()
                check = 0
                if profile:
                    check += 1
                    yield scrapy.Request(profile,
                                         callback=self.parse_profile,
                                         meta={'item': item})

                temp = ItemLoader(item=CommentsItem(), selector=reply)
                temp.context['lang'] = self.lang

                temp.add_xpath(
                    'reactions',
                    './/a[contains(@href,"reaction/profile")]/@href')
                reactions = temp.get_collected_values('reactions')
                if reactions:
                    check += 1
                    reactions = "https://mbasic.facebook.com" + temp.get_collected_values(
                        'reactions')[0]
                    temp = 0
                    yield scrapy.Request(reactions,
                                         callback=self.parse_reactions,
                                         meta={'item': item})

                if check == 0:
                    yield item

            back = response.xpath(
                '//div[contains(@id,"comment_replies_more_1")]/a/@href'
            ).extract()
            if back:
                self.logger.info('Back found, more nested comments')
                back_page = response.urljoin(back[0])
                yield scrapy.Request(back_page,
                                     callback=self.parse_reply,
                                     priority=1000,
                                     meta={
                                         'reply_to': response.meta['reply_to'],
                                         'flag': 'back',
                                         'url': response.meta['url'],
                                         'index': response.meta['index'],
                                         'group': response.meta['group']
                                     })

            else:
                next_reply = response.meta['url']
                self.logger.info(
                    'Nested comments crawl finished, heading to proper page: {}'
                    .format(response.meta['url']))
                yield scrapy.Request(next_reply,
                                     callback=self.parse_post,
                                     meta={
                                         'index': response.meta['index'] + 1,
                                         'group': response.meta['group']
                                     })

        elif response.meta['flag'] == 'back':
            """
            adds random pauses to reduce the chance of being blocked.
            DOWNSIDE: the crawl slows down, but it still runs reasonably
            quickly. The longer the pauses, the more likely you are to go
            undetected, but for large crawls long pauses may be impractical.
            """
            #print("did we make it")
            r = randrange(0, 20)
            time.sleep(r)
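            # NOTE: time.sleep() blocks Scrapy's single-threaded reactor, so
            # every queued request stalls during the pause. An alternative is
            # to let Scrapy randomize its own per-request delay; the settings
            # below are illustrative, not necessarily what this spider uses.
            #
            # custom_settings = {
            #     'DOWNLOAD_DELAY': 10,               # base delay in seconds
            #     'RANDOMIZE_DOWNLOAD_DELAY': True,   # waits 0.5x-1.5x the base delay
            # }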
            #parse all replies in the page
            for reply in response.xpath(
                    '//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]'
            ):
                #print("reply")
                new = ItemLoader(item=CommentsItem(), selector=reply)
                new.context['lang'] = self.lang
                new.add_xpath('source', './/h3/a/text()')
                new.add_xpath('source_url', './/h3/a/@href')
                new.add_value('reply_to', response.meta['reply_to'])
                new.add_xpath('text', './/div[h3]/div[1]//text()')
                new.add_xpath('date', './/abbr/text()')
                new.add_xpath(
                    'reactions',
                    './/a[contains(@href,"reaction/profile")]//text()')
                new.add_value('url', response.url)
                """
                PROFILE REACTIONS SECTION (REPEAT, SEE LINE 176)
                """

                profile = "https://mbasic.facebook.com" + new.get_collected_values(
                    'source_url')[0]

                #profile = response.xpath(".//h3/a/@href")
                #profile = response.urljoin(profile[0].extract())
                #print('profile', profile)
                #print('new item', new.get_collected_values('name'))
                check = 0
                item = new.load_item()
                if profile:
                    check += 1
                    print(1)
                    yield scrapy.Request(profile,
                                         callback=self.parse_profile,
                                         meta={'item': item})

                #response --> reply/root
                #print("before ", item)
                temp = ItemLoader(item=CommentsItem(), selector=reply)
                temp.context['lang'] = self.lang

                temp.add_xpath(
                    'reactions',
                    './/a[contains(@href,"reaction/profile")]/@href')
                reactions = temp.get_collected_values('reactions')
                if reactions:
                    check += 1
                    reactions = "https://mbasic.facebook.com" + temp.get_collected_values(
                        'reactions')[0]
                    temp = 0
                    print(2)
                    yield scrapy.Request(reactions,
                                         callback=self.parse_reactions,
                                         meta={'item': item})

                if check == 0:
                    print(3)
                    yield item
                #print("after ", item)

            #keep going backwards
            back = response.xpath(
                '//div[contains(@id,"comment_replies_more_1")]/a/@href'
            ).extract()
            if back:
                self.logger.info('Back found, more nested comments')
                back_page = response.urljoin(back[0])
                yield scrapy.Request(back_page,
                                     callback=self.parse_reply,
                                     priority=1000,
                                     meta={
                                         'reply_to': response.meta['reply_to'],
                                         'flag': 'back',
                                         'url': response.meta['url'],
                                         'index': response.meta['index'],
                                         'group': response.meta['group']
                                     })

            else:
                next_reply = response.meta['url']
                self.logger.info(
                    'Nested comments crawl finished, heading to proper page: {}'.
                    format(response.meta['url']))
                yield scrapy.Request(next_reply,
                                     callback=self.parse_post,
                                     meta={
                                         'index': response.meta['index'] + 1,
                                         'group': response.meta['group']
                                     })
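    # The profile and reactions callbacks are not included in this snippet;
    # the partially-built item is handed to them through the request meta and
    # is presumably yielded from there once enriched. A minimal sketch of such
    # a callback, assuming CommentsItem declares a hypothetical 'profile_info'
    # field (names here are illustrative, not the author's):
    def parse_profile_sketch(self, response):
        item = response.meta['item']
        # attach something from the profile page to the comment item
        item['profile_info'] = response.xpath('//title/text()').get()
        yield item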
Beispiel #29
0
    def getInfo(self, res):
        if not mch(res):
            return
        response = etree.HTML(res.text)
        loader = ItemLoader(item=booking.Booking(), response=res)
        supplier_obj_id = res.meta.get('statics.hotels.id')
        supplier_name = res.meta.get('statics.hotels.supplier')
        if supplier_obj_id:
            loader.add_value('statics_hotels_id', supplier_obj_id)
            loader.add_value('statics_hotels_supplier', supplier_name)
        pic = []
        # Each entry in self.allXpath is named "<field>_<label>"; the label
        # suffix picks the extraction strategy below. The 'sua', 'pic' and
        # 'xpl' entries pack an outer and an inner selector into one string,
        # separated by the literal token 'weego'.
        for e in self.allXpath:
            xpath = getattr(bk, e)
            fieldName, label = '_'.join(e.split('_')[:-1]), e.split('_')[-1]
            tempResult = ''
            if label == 'non':
                # plain XPath, first match
                if response.xpath(xpath):
                    tempResult = response.xpath(xpath)[0].strip()
            elif label == 'ren':
                # regex, first match
                if re.findall(xpath, res.text):
                    tempResult = re.findall(xpath, res.text)[0].strip()
            elif label == 'rea':
                # regex, all matches concatenated
                if re.findall(xpath, res.text):
                    for each in re.findall(xpath, res.text):
                        tempResult += each.strip()
            elif label == 'sub':
                # XPath node text with repeated newlines collapsed
                if response.xpath(xpath):
                    tempResult = re.sub(
                        '\\n+', '\\n',
                        response.xpath(xpath)[0].xpath('string(.)')).strip()
            elif label == 'sua':
                # outer selector + inner selector, results concatenated
                selects, subSelects, y = xpath.split('weego')[0], xpath.split(
                    'weego')[1], xpath.split('weego')[2:]
                for each in response.xpath(selects):
                    temp = each.xpath(subSelects)
                    if isinstance(temp, list):
                        tempResult += temp[0]
                    elif isinstance(temp, str):
                        tempResult += temp
                tempResult = re.sub('\\n+', '\\n', tempResult).strip()
            elif label == 'pic':
                # picture URLs collected via XPath
                selects, subSelects, y = xpath.split('weego')[0], xpath.split(
                    'weego')[1], xpath.split('weego')[2:]
                for each in response.xpath(selects):
                    temp = each.xpath(subSelects)
                    pic.append(temp[0])
                tempResult = pic
            elif label == 'pir':
                # picture URLs collected via regex
                for each in re.findall(xpath, res.text):
                    pic.append(each)
                tempResult = pic
            elif label == 'xpl':
                # list field: one cleaned value per outer selector match
                selects, subSelects, y = xpath.split('weego')[0], xpath.split(
                    'weego')[1], xpath.split('weego')[2:]
                tl = []
                for each in response.xpath(selects):
                    temp = re.sub('\\n+', ' - ',
                                  each.xpath(subSelects).strip())
                    tl.append(temp)
                loader.add_value(fieldName.lower(), tl)
            if label != 'xpl':
                # only overwrite a previously collected value if it was empty
                if loader.get_collected_values(fieldName.lower()):
                    if loader.get_collected_values(fieldName.lower())[0] == '':
                        loader.replace_value(fieldName.lower(), tempResult)
                else:
                    loader.add_value(fieldName.lower(), tempResult)
        yield loader.load_item()
    def parse_course(self, response):
        l = ItemLoader(item=ConestogacCourseItem(), response=response)
        l.default_output_processor = TakeFirst()

        course_data = response.xpath('//div[@data-accordion][1]')

        l.add_value('institution_name', 'Conestoga College')
        l.add_xpath('course_code', '//div[@class="hero-banner"]//span/text()')
        l.add_xpath('course_name',
                    '//h1[contains(@class, "text-white")]/text()')
        l.add_value(
            'delivery_types',
            course_data.xpath(
                './/small[strong[contains(text(), "Delivery:")]]/following-sibling::small/text()'
            ).get())
        l.add_value('url', response.url)
        # l.add_value('faculty', '???????????')
        l.add_xpath(
            'description',
            '//h2[contains(text(), "Course description")]/following-sibling::p[1]/text()'
        )

        price = course_data.xpath(
            './/small[strong[contains(text(), "Cost:")]]/following-sibling::small/text()'
        ).get()
        if price:
            price = price.lstrip('$')
        else:
            price = '0.0'
        l.add_value('price', [price])

        weekday_time_data = course_data.xpath(
            './/small[strong[contains(text(), "Day/Time:")]]/following-sibling::small/text()'
        ).getall()
        if not weekday_time_data:
            return
        weekday_time_data = [
            remove_garbage(data) for data in weekday_time_data
        ]
        # ['Thurs. 9:00am – 4:00pm', 'Fri. 9:00am – 4:00pm']
        weekday_time_data = [
            data for data in weekday_time_data if len(data) > 1
        ]

        if weekday_time_data:
            weekdays = [
                re.search(r'(^\w+)', d).group(1)
                if re.search(r'(^\w+)', d) else '' for d in weekday_time_data
            ]
            weekdays = [d for d in weekdays if d]
        else:
            weekdays = []

        l.add_value('days', [weekdays])
        l.add_value(
            'prerequisite',
            response.xpath(
                '//strong[contains(text(), "Prerequisites:")]/following-sibling::a/text()'
            ).getall())
        l.add_value(
            'corequisites',
            response.xpath(
                '//strong[contains(text(), "Corequisites:")]/following-sibling::a/text()'
            ).getall())
        l.add_value('program', 'Continuing Education')

        if weekday_time_data:
            duration_hours_list = [
                re.findall(r'\d{1,2}:\d{1,2}\w{2}', t)
                for t in weekday_time_data
            ]
        else:
            duration_hours_list = []
        l.add_value('duration_hours', duration_hours_list)
        l.add_value('duration_days_week', l.get_collected_values('days'))

        start_date = course_data.xpath(
            './/small[strong[contains(text(), "Start Date:")]]/following-sibling::small/text()'
        ).get()
        if start_date:
            start_date = re.sub(r'(\s*\.\s+|\s*,\s+)', '-', start_date)
            start_date = datetime.strptime(start_date, '%b-%d-%Y')

        end_date = course_data.xpath(
            './/small[strong[contains(text(), "End date:")]]/following-sibling::small/text()'
        ).get()
        if end_date:
            end_date = re.sub(r'(\s*\.\s+|\s*,\s+)', '-', end_date)
            end_date = datetime.strptime(end_date, '%b-%d-%Y')

        duration_month_list = [[start_date, end_date]]

        l.add_value('duration_months', duration_month_list)
        l.add_value('duration_as_string', [
            l.get_collected_values('duration_hours'),
            l.get_collected_values('duration_days_week'),
            l.get_collected_values('duration_months'),
        ])

        hours_site = course_data.xpath(
            './/small[strong[contains(text(), "Hours:")]]/following-sibling::small/text()'
        ).get()
        if not hours_site:
            hours_site = 0

        l.add_value('total_hours', [
            l.get_collected_values('duration_hours'),
            l.get_collected_values('duration_days_week'),
            hours_site,
        ])

        yield l.load_item()
    def parse_program(self, response):
        programs = response.xpath('//div[h2[@id]]')
        for program_block in programs:
            program = program_block.xpath('./h2/text()').get()

            program_block_html_string = program_block.get()
            program_block_html_string = re.sub(r'^\s*<div>\s*', '',
                                               program_block_html_string)
            program_block_html_string = re.sub(r'\s*</div>\s*$', '',
                                               program_block_html_string)

            courses = program_block_html_string.split('<hr class="modest">')
            courses = [el for el in courses if el]

            for course_html in courses:

                course = Selector(text=course_html)
                l = ItemLoader(item=CamosunCourseItem())
                # l.default_input_processor = MapCompose(lambda x: x.strip())
                l.default_output_processor = Join(' | ')

                course = course.xpath(
                    '//h3[@id and not(following-sibling::p[contains(@class, "alert-info")]) and not(following-sibling::del)]'
                )
                # If the block has no matching h3 element, skip it
                if not course:
                    continue

                l.add_value('institution_name', 'Camosun College')
                l.add_value('course_code', course.xpath('./@id').get())
                l.add_value('course_name', course.xpath('./text()').get())
                l.add_value('delivery_types', 'Onsite')
                l.add_value('url', response.url)
                l.add_value('faculty', response.meta['faculty'])
                l.add_value(
                    'description',
                    course.xpath('./following-sibling::p[1]//text()').getall())

                ul_blocks = course.xpath(
                    './following-sibling::ul[contains(string(), "$")]')

                # Skip course if no ul block with days and price
                if not ul_blocks:
                    continue

                ul_data = []
                dates_data = []
                for ul in ul_blocks:
                    # Parse weekdays and times
                    ul_string = remove_tags(ul.get())
                    ul_string = re.sub(r'\s{2,}', ' ', ul_string)
                    ul_string = remove_garbage(ul_string)
                    ul_string = ul_string.strip()
                    ul_data.append(ul_string)

                    # Parse dates text node
                    date_string = ul.xpath(
                        './preceding-sibling::text()[1]').get('')
                    date_string = remove_garbage(date_string)
                    # First check whether we got dates or just caught the bullets
                    if len(date_string) < 5:
                        date_string = ul.xpath(
                            '(./preceding-sibling::text()[2])').get('')
                        date_string = remove_garbage(date_string)
                    # Strip any leading garbage before the year 2019
                    re_search = re.search(r'^(.+)2019', date_string)
                    if re_search:
                        remove_pattern = re.escape(re_search.group(1))
                        date_string = re.sub(remove_pattern, '', date_string)
                    # Keep only strings that contain 2019 in the dates list
                    if '2019' in date_string:
                        dates_data.append(date_string.strip())

                prices = [
                    re.search(r'\$(\d+)', p).group(1) if re.search(
                        r'\$(\d+)', p) else '0.0' for p in ul_data if p
                ]
                l.add_value('price', prices)
                # l.add_value('subject', ul_data)

                # Get weekday strings
                # Drop strings that do not contain a time
                weekdays = [
                    wd if re.search(r'\d+:\d+\w{2}', wd) else ''
                    for wd in ul_data if wd
                ]
                # Extract the weekday portion of each string
                weekdays = [
                    re.search(r'^[^\d]+', wd).group() if re.search(
                        r'^[^\d]+', wd) else [] for wd in weekdays if wd
                ]
                # Strip trailing bullets from each string
                weekdays = [re.sub(r'\W+$', '', i) for i in weekdays if i]
                # Drop strings left empty by the cleanup above
                weekdays = [wd.split(' ') for wd in weekdays if wd]
                l.add_value('days', weekdays)
                l.add_value('program', program)

                # Get time groups like DD:DDam-DD:DDam
                duration_hours = [
                    re.findall(r'(\d+:\d+\w{2}-\d+:\d+\w{2})', tm)
                    for tm in ul_data if tm
                ]
                # Prepare list of times like [['6:30pm', '9:30pm'], ['8:30am', '4:30pm']]
                # duration_hours = [tm[0].split('-') for tm in duration_hours if tm]
                duration_hours_list = []
                for tm in duration_hours:
                    if not tm:
                        continue
                    if len(tm) > 1:
                        for interval in tm:
                            duration_hours_list.append(interval.split('-'))
                    else:
                        duration_hours_list.append(tm[0].split('-'))

                l.add_value('duration_hours', duration_hours_list)
                l.add_value('duration_days_week',
                            l.get_collected_values('days'))

                # Looking for month interval
                duration_month_list = []
                dur_month_tpl = '{year} {month}'
                for mon in dates_data:
                    if not mon:
                        continue
                    mon_res = re.search(
                        r'(2019).+(\w{3} \d+) - (\w{3} \d+)?|(2019).+(\w{3} \d+)',
                        mon)
                    if not mon_res:
                        continue
                    year, start_m, end_m, one_year, one_m = mon_res.groups()

                    if one_m:
                        m_start = dur_month_tpl.format(year=one_year,
                                                       month=one_m)
                        m_end = dur_month_tpl.format(year=one_year,
                                                     month=one_m)
                    else:
                        m_start = dur_month_tpl.format(year=year,
                                                       month=start_m)
                        m_end = dur_month_tpl.format(year=year, month=end_m)

                    duration_month_list.append([m_start, m_end])

                l.add_value('duration_months', duration_month_list)
                l.add_value('duration_as_string', [
                    l.get_collected_values('duration_hours'),
                    l.get_collected_values('duration_days_week'),
                    l.get_collected_values('duration_months'),
                ])

                l.add_value('total_hours', [
                    l.get_collected_values('duration_hours'),
                    l.get_collected_values('duration_days_week'),
                ])

                # l.add_value('corequisites', dates_data)

                yield l.load_item()
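In both course spiders above, the values already collected on the loader (duration_hours, days, duration_months) are fed back into add_value for duration_as_string and total_hours, so combining them into a single value presumably happens in the items' output processors, which are not part of this listing. A minimal sketch of such a processor under that assumption; the function names and the Field wiring are illustrative, not the original items.py:

def _flatten(values):
    # Recursively flatten the nested lists the loaders collect above.
    for v in values:
        if isinstance(v, (list, tuple)):
            yield from _flatten(v)
        else:
            yield str(v)


def combine_duration_as_string(values):
    # e.g. [[['6:30pm', '9:30pm']], [['Mon', 'Wed']], [['2019 Jan', '2019 Mar']]]
    # -> '6:30pm 9:30pm Mon Wed 2019 Jan 2019 Mar'
    return ' '.join(_flatten(values))


# Illustrative wiring on the item definition (not the original items.py):
# duration_as_string = scrapy.Field(output_processor=combine_duration_as_string)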