Example 1
class XinhuaNewsSpider(CrawlSpider):
    name = "xinhua_news_spider"
    start_urls = ['http://www.xinhuanet.com/']

    allowed_domains = ['xinhuanet.com']
    # http://news.xinhuanet.com/fortune/2017-11/10/c_1121937779.htm
    url_pattern = r'http://news.xinhuanet.com/([a-z]+)/*/2017-(\d{1,2})/(\d{1,2})/c\_(\d{6,10}).htm'
    rules = [
        Rule(LxmlLinkExtractor(allow=[url_pattern]),
             callback='parse_news',
             follow=True)
    ]

    def parse_news(self, response):
        sel = Selector(response)
        title = sel.xpath('//div[@class="h-title"]/text()').extract()

        pattern = re.match(self.url_pattern, str(response.url))
        source = 'xinhuanet.com'

        date = sel.xpath('//div[@class="h-info"]/span/text()').extract()

        time = sel.xpath('//div[@class="h-info"]/span/text()').extract()
        url = response.url

        newsId = re.findall(r'c_(.*?).htm', url, re.S)[0]
        contents = ListCombiner(
            sel.xpath('//div[@id="p-detail"]/p/text()').extract())
        # comments= sel.xpath('//div[@class="right"]/span'

        comments = 0
        item = NewsItem()
        item['source'] = source
        item['time'] = time
        item['date'] = date
        item['contents'] = contents
        item['title'] = title
        item['url'] = url
        item['newsId'] = newsId
        item['comments'] = comments
        yield item
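
A quick sanity check of url_pattern against the sample URL in the comment above (groups: section, month, day, article id):

    import re

    url_pattern = r'http://news.xinhuanet.com/([a-z]+)/*/2017-(\d{1,2})/(\d{1,2})/c\_(\d{6,10}).htm'
    sample = 'http://news.xinhuanet.com/fortune/2017-11/10/c_1121937779.htm'
    print(re.match(url_pattern, sample).groups())
    # ('fortune', '11', '10', '1121937779')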
Example 2
    def parse(self, response):
        for link in LxmlLinkExtractor(
                allow=self.allowed_domains).extract_links(response):
            # print(link.url)
            yield scrapy.Request(link.url, self.parse)

        # extract description and meta tags from the site
        soup = BeautifulSoup(response.text, "lxml")

        title = soup.title.string
        og_type = soup.find("meta", property="og:type")
        og_site_name = soup.find("meta", property="og:site_name")
        og_image = soup.find("meta", property="og:image")
        og_title = soup.find("meta", property="og:title")
        og_url = soup.find("meta", property="og:url")
        raw_text = soup.get_text()

        og_type = og_type.get("content", None) if og_type else None
        og_site_name = og_site_name.get("content",
                                        None) if og_site_name else None
        og_image = og_image.get("content", None) if og_image else None
        og_title = og_title.get("content", None) if og_title else None
        og_url = og_url.get("content", None) if og_url else None

        collection = self.db.pages

        # update or insert
        collection.update(
            {"url": response.url},
            {
                "$set": {
                    "url": response.url,
                    "domain": self.allowed_domains[0],
                    "title": title,
                    "og_type": og_type,
                    "og_site_name": og_site_name,
                    "og_image": og_image,
                    "og_title": og_title,
                    "og_url": og_url,
                    "raw_text": raw_text
                }
            },
            upsert=True)
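
A note on the storage call: PyMongo deprecated Collection.update in 3.0 and removed it in 4.0, so on current versions the equivalent upsert would use update_one (a sketch reusing the fields built above):

        collection.update_one(
            {"url": response.url},
            {"$set": {
                "url": response.url,
                "domain": self.allowed_domains[0],
                "title": title,
                "og_type": og_type,
                "og_site_name": og_site_name,
                "og_image": og_image,
                "og_title": og_title,
                "og_url": og_url,
                "raw_text": raw_text,
            }},
            upsert=True)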
Example 3
class MushroomWorldSpider(CrawlSpider):
    name = "mushroom_world_spider"
    start_urls = ["http://www.mushroom.world/mushrooms/namelist"]

    rules = (
        Rule(LxmlLinkExtractor(
            restrict_xpaths=("//div[@class='item']")),
            follow=True,
            callback='parse_item'
        ),
      )

    def parse_item(self, response):
        name = response.css(".caption b ::text").extract_first().strip()
        description = response.css(".longtextus ::text").extract_first()
        family = response.css(".textus ::text").extract_first()
        # NOTE: './/textus[1]' selects an element named <textus>; a class-based
        # selector such as '(.//*[@class="textus"])[2]/text()' was probably intended.
        location = response.xpath(".//textus[1]/text()").extract()
        dimensions = response.css("#mushroom-list:nth-child(2) ::text").extract()
        edibility = response.css("#mushroom-list:nth-child(3) ::text").extract()
        yield MushroomWorldItem(name=name, description=description, family=family,
                                    location=location, dimensions=dimensions, edibility=edibility)
Example 4
 def parse(self, response):
     ##response only from body
     # html = requests.get(response,timeout=40)
     # bs = BeautifulSoup(html.text)
     # body = bs.find("body")
     # print(response.url,len(response.body))
     # responseb = HtmlResponse(url=html.url,body=str(body),encoding="utf-8")
     responseb = HtmlResponse(url=response.url, body=response.body)
     linkObjs = LxmlLinkExtractor().extract_links(responseb)
     ##include(?):re.search("(([.]css|;)$|javascript|mailto:|tel:)",i) is None
     ##keep links orbisweb (not optimal yet!)
     ##include %s/ ?
     pattern = "([.]%s|%s[.])" % (self.allow, self.allow)
     links = [
         l.url for l in linkObjs if re.search(pattern, l.url) is not None
     ]
     links = list(set(links))
     self.nlinks = len(links)
     for l in links:
         #self.successParse(l)
         yield self.successParse(l)
Example 5
    def parse(self, response):
        # print("%s : %s : %s" % (response.status, response.url, response.text))

        # print title text with css and xpath selectors
        title_text = response.css('title::text')
        print(title_text.get())
        title_text = response.xpath('//title[1]/text()')
        print(title_text.get())

        # Get all anchor tags with css and xpath selectors
        css_links = response.css('a::attr(href)').getall()
        xpath_links = response.xpath('//a/@href').getall()
        print(len(css_links))
        print(len(xpath_links))
        for (link, xlink) in zip(css_links, xpath_links):
            print('{} {} '.format(link, xlink))

        # fetch url from github and avoid social media sites
        trending_links = LxmlLinkExtractor(
            allow=r'^https://[a-z.]+/[a-z.]+$',
            deny_domains=['shop.github.com', 'youtube.com', 'twitter.com'],
            unique=True).extract_links(response)
        for link in trending_links:
            print("%s : %s " % (link.url, link.text))
Example 6
    def parse(self, response: Response) -> Iterator[Request]:
        self.log(response)

        if getattr(self, 'validate_html', False):
            yield Request(
                'http://127.0.0.1:9988/?out=json',
                method='POST',
                headers={'Content-Type': response.headers['Content-Type']},
                body=response.body,
                callback=self._vnu_callback(response.url),
                errback=self.error_callback,
            )

        for link in LxmlLinkExtractor(
                deny_domains=self.deny_domains,
                deny_extensions=['doc'],
                tags=self.tags,
                attrs=self.attrs,
                deny=self.deny,
                canonicalize=False).extract_links(response):
            yield from self._make_requests(link.url)
Example 7
class BaseSpider(scrapy.Spider):
    name = "base"
    link_extractor = LxmlLinkExtractor(allow=(),
                                       deny=(),
                                       allow_domains=(["example.com"]),
                                       deny_domains=(),
                                       deny_extensions=None,
                                       restrict_xpaths=(),
                                       restrict_css=(),
                                       tags=('a', 'area'),
                                       attrs=('href', ),
                                       canonicalize=False,
                                       unique=True,
                                       process_value=None,
                                       strip=True)

    def start_requests(self):
        base_url = 'http://example.com/'

        yield scrapy.Request(url=base_url, callback=self.parse)

    def parse(self, response):

        links = self.link_extractor.extract_links(response)
        links_processing_start = time.time()
        for link in links:
            yield {'source_url': response.url, 'destination_url': link.url}
            yield scrapy.Request(url=link.url, callback=self.parse)
        print("TOTAL TIME TOOK | " +
              str(time.time() - links_processing_start) +
              " | TOTAL LINKS COUNT | " + str(len(links)))
        yield {
            'url': response.url,
            'download_time': 0,
            'download_latency': response.meta['download_latency']
        }
        os._exit(1)

    def parseError(self, response):
        print('Error')
Example 8
class ZhihuSpider(CrawlSpider):
    name = 'zhihu2'
    allowed_domains = ['zhuanlan.zhihu.com']
    start_urls = [
        'https://zhuanlan.zhihu.com/bankk',
    ]
    rules = (
        Rule(LinkExtractor(allow=('https://zhuanlan.zhihu.com/(\w+)*$', )),
             callback='parse_item',
             follow=True),
        Rule(LinkExtractor(allow=('https://zhuanlan.zhihu.com/p/(\d+)*$', )),
             callback='parse_item',
             follow=True),
        Rule(LxmlLinkExtractor(allow=('/p/(\d+)*$', ),
                               tags=('a', ),
                               attrs=('href', ),
                               process_value='add_links'),
             callback='parse_item',
             follow=True),
    )

    # A pipeline is used here to store the items.
    # When different pipelines are needed, remove the pipeline from settings.py and
    # define one per spider (e.g. via custom_settings), or branch inside the pipeline.
    # custom_settings = {
    #     'ITEM_PIPELINES': {
    #         'myscrapytest.pipelines.ZhihuPipeline': 400
    #     }
    # }
    def parse_item(self, response):
        soup = BeautifulSoup(response.text, 'lxml')
        title = soup.title.string
        item = ZhihuItem()
        item['title'] = title
        item['url'] = response.url
        yield item
        # self.log('Saved file %s' % filename)

    def add_links(self, value):
        return 'https://zhuanlan.zhihu.com' + value
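
One caveat on the third rule above: unlike a Rule's callback, a LinkExtractor's process_value must be a callable rather than a method name, so the string 'add_links' is never resolved to the add_links method defined in the class, and the URL prefix is not actually applied (or extraction fails, depending on the Scrapy version). A minimal sketch of one way to wire it up, assuming the helper is moved to module level:

    def add_links(value):
        # hypothetical module-level helper mirroring the add_links method above
        return 'https://zhuanlan.zhihu.com' + value

    # ...then, in the class body:
    # Rule(LxmlLinkExtractor(allow=('/p/(\d+)*$', ),
    #                        tags=('a', ),
    #                        attrs=('href', ),
    #                        process_value=add_links),
    #      callback='parse_item',
    #      follow=True),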
Example 9
class ImdbSpider(CrawlSpider):
    name = 'imdb'
    allowed_domains = ['www.imdb.cn']
    rules = [
        Rule(LxmlLinkExtractor(allow=r'/title/tt\d+$'),
             callback='parse_imdb',
             follow=True)
    ]

    def start_requests(self):
        pages = []
        for i in range(1, 14616):
            url = "http://www.imdb.cn/nowplaying/" + str(i)
            yield Request(url=url, callback=self.parse)

    def parse_imdb(self, response):
        item = CrawldemoItem()
        item['url'] = response.url
        item['title'] = ''.join(
            response.xpath(
                '//*[@class="fk-3"]/div[@class="hdd"]/h3/text()').extract())
        yield item
Example 10
    def parse_page(self, response):
        """
        General page parser
        :param response: 
        :return: 
        """
        links_visit = set()
        links = set()
        for link in LxmlLinkExtractor(allow=(),
                                      deny=()).extract_links(response):
            links.add(link.url)

        logger.info('Current url: %s' % response.url)
        logger.info('Current resp: %s' % response)

        # Search result - container element
        lists = response.xpath(
            '//div[@id="primary"]//div[@class="listWidget"]')
        for list_widget in lists:
            logger.debug('List widget: %s' % list_widget)
            eapp = list_widget.xpath('div[@class="appRow"]')
            einfo = list_widget.xpath('div[@class="infoSlide"]')

            if len(eapp) == 0:
                logger.warning('No results')
                return

            for eapp1 in eapp:
                logger.debug(eapp1)

                #ahref = eapp1.xpath('div/div/div/h5/a')[0]
                #link = ahref.attrib['href']
                #title = ahref.xpath('text()')
                #logger.debug('Title / link %s %s ' % (title, link))

        logger.debug('Extracted %s links from %s' %
                     (len(links_visit), response.url))
        for link in list(links_visit):
            pass
Example 11
class EuropythonSpyder(CrawlSpider):
    def __init__(self, year='', *args, **kwargs):
        super(EuropythonSpyder, self).__init__(*args, **kwargs)
        self.year = year
        self.start_urls = [
            'http://ep' + str(self.year) + ".europython.eu/en/events/sessions"
        ]
        print('start url: ' + str(self.start_urls[0]))

    name = "europython_spyder"

    allowed_domains = [
        "ep2015.europython.eu", "ep2016.europython.eu", "ep2017.europython.eu",
        "ep2018.europython.eu"
    ]

    # Pattern for entries that match the conference/talks format
    rules = [
        Rule(LxmlLinkExtractor(allow=['conference/talks']),
             callback='process_response')
    ]

    def process_response(self, response):
        item = EuropythonItem()
        print(response)
        item['title'] = response.xpath(
            "//div[contains(@class, 'grid-100')]//h1/text()").extract()
        item['author'] = response.xpath(
            "//div[contains(@class, 'talk-speakers')]//a[1]/text()").extract()
        item['description'] = response.xpath(
            "//div[contains(@class, 'cms')]//p//text()").extract()
        item['date'] = response.xpath(
            "//section[contains(@class, 'talk when')]/strong/text()").extract(
            )
        item['tags'] = response.xpath(
            "//div[contains(@class, 'all-tags')]/span/text()").extract()

        return item
Example 12
class Comics(CrawlSpider):
    '''Scrapy spider.

    inherit from CrawlSpider.
    '''

    name = "comics"
    allowed_domains = ["www.tazhe.com"]
    start_urls = ["http://www.tazhe.com/mh/"]
    rules = [
        Rule(LxmlLinkExtractor(allow=(r'http://www.tazhe.com/mh/\d+')),
             callback="parse_item"),
    ]

    def __init__(self, *args, **kwargs):
        super(Comics, self).__init__(*args, **kwargs)

    def parse_item(self, response):
        '''rewrite class method.
        '''
        #from scrapy.shell import inspect_response
        #inspect_response(response, self)
        sel = response.selector
        item = ComicsItem()
        item['name'] = sel.xpath(
            '//*[@id="intro_l"]/div[1]/h1/text()').extract()
        item['author'] = sel.xpath(
            '//*[@id="intro_l"]/div[2]/p[2]/text()').extract()
        item['update_time'] = sel.xpath(
            '//*[@id="intro_l"]/div[2]/p[1]/span/text()').extract()
        item['last_update'] = sel.xpath(
            '//*[@id="intro_l"]/div[1]/span/font/text()').extract()
        item['classification'] = sel.xpath(
            '//*[@id="intro_l"]/div[2]/p[5]/a/text()').extract()
        item['introduction'] = sel.xpath(
            '//*[@id="intro1"]/p/text()[1]').extract()
        item['url'] = response.url
        return item
Example 13
 def parse_obj(self, response):
     item = MyItem()
     item['url'] = []
     for link in LxmlLinkExtractor(
             allow=(),
             deny=(),
             deny_extensions=None,
             tags=('a', 'area', 'q', 'meta', 'track', 'object', 'style',
                   'video', 'applet', 'body', 'button', 'del', 'head',
                   'html', 'input', 'ins', 'img', 'source', 'base',
                   'blockquote', 'embed', 'form', 'frame', 'iframe', 'link',
                   'script'),
             attrs=('href', 'src', 'data', 'archive', 'codebase', 'poster',
                    'code', 'cite', 'background', 'formaction', 'profile',
                    'xmlns', 'ping', 'longdesc', 'srcset', 'action',
                    'srcdoc', 'scheme'),
             process_value=None,
             unique=True).extract_links(response):
         is_allowed = False
         is_regex_output = False
         for allowed_domain in self.allowed_domains:
             if re.match("^https?:\/\/" + allowed_domain,
                         link.url) is not None:
                 is_allowed = True
         if re.match("^https?:\/\/" + self.regex_output,
                     link.url) is not None:
             is_regex_output = True
         if is_allowed:
             item['url'].append(link.url)
         if is_regex_output:
             z = open("re-match-urls.txt", "a")
             z.write(link.url + "\n")
             z.close()
         else:
             f = open("other-urls.txt", "a")
             f.write(link.url + "\n")
             f.close()
     return item
Example 14
    def parse(self, response):
        logger.info('jobdiva|url in parse %s', response.url)
        self.crawler.stats.inc_value('completed_url', 1)
        self.crawler.stats.set_value('spider', 'jobdiva')
        response_value = -2
        temp = {'urls': []}
        tags = ['span', 'td']
        item = parse_fields(self.crawl_request, response, response_value, tags)
        iframe_url = response.css('iframe::attr(src)').extract()

        for url in iframe_url:
            for allowed_domain in self.allowed_domains:
                response_value = url.find(allowed_domain)
                if response_value >= 0:
                    yield scrapy.Request(url=url, callback=self.parse)
        if len(item) != 0:
            yield item
        for link in LxmlLinkExtractor(
                allow_domains=self.allowed_domains).extract_links(response):

            url = response.urljoin(link.url)
            temp['urls'].append(url)
            yield scrapy.Request(url=url, callback=self.parse)
Example 15
    def parse_obj(self, response):
        """
        Base parsing routine - pure link extractor
        :param response:
        :return:
        """
        links_visit = set()
        links = set()
        for link in LxmlLinkExtractor(allow=(),
                                      deny=()).extract_links(response):
            links.add(link.url)

            # Another filter if desired
            if self.should_follow_link(link.url, response):
                links_visit.add(link.url)

        for d in list(links):
            item = LinkItem()
            item['url'] = d
            yield item

        for d in list(links_visit):
            yield Request(d)
Example 16
 def parse(self, response: Any) -> Generator[Request, None, None]:
     self.log(response)
     for link in LxmlLinkExtractor(
             deny_domains=self.deny_domains,
             deny_extensions=['doc'],
             tags=self.tags,
             attrs=self.attrs,
             deny=self.deny,
             canonicalize=False).extract_links(response):
         callback = self.parse  # type: Any
         dont_filter = False
         method = 'GET'
         if self._is_external_url(link.url):
             callback = self.check_existing
             method = 'HEAD'
         elif '#' in link.url:
             dont_filter = True
             callback = self.check_permalink
         yield Request(link.url,
                       method=method,
                       callback=callback,
                       dont_filter=dont_filter,
                       errback=self.error_callback)
Example 17
    def parse(self, response):
        parsed_uri = urlparse(response.url)
        domainurl = '{uri.netloc}'.format(uri=parsed_uri)

        # If the amount of downloaded pages of one site exceeds the limit, all following requests of the same domain will be removed from the queue
        if int(job_redis.hlen(domainurl)) > self.maximumPagesPerSite:
            regex = re.compile(r'\b' + domainurl + r'\b')
            if len(filter(lambda i: regex.search(i), self.start_urls)) > 0:
                for item in filter(lambda i: regex.search(i), self.start_urls):
                    self.start_urls.remove(item)
            return

        # Remove urls containing anchor mark, phone numbers, emails and login pages
        for link in LxmlLinkExtractor(
                deny=[r'[\S\s]*#[\S\s]*', r'[\S\s]*\/tel:[\S\s]*',
                      r'[\S\s]*\/fax:[\S\s]*', r'[\S\s]*\/mailto:[\S\s]*',
                      r'[\S\s]*\/login[\S\s]*', r'[\S\s]*\/\+[0-9]*$'],
                allow_domains=self.allow_domains).extract_links(response):
            if int(job_redis.hlen(domainurl)) > self.maximumPagesPerSite:
                break
            else:
                self.start_urls.append(link.url)

        # Add sites having response codes from 400 to 600 to an error set
        if response.status in range(400, 600):
            job_redis.sadd('error', response.url)
        else:
            item = StandaloneItem()
            tempinput = response.xpath("//body")

            # Extract the domain, title, text and url of a website
            if tempinput:
                templist = []
                templist.append(
                    re.sub(r'\s+', ' ', tempinput.extract()[0].strip()))
                item['domain'] = [domainurl]
                item['data'] = templist
                item['title'] = response.xpath(
                    "normalize-space(//title)").extract()
                item['link'] = [response.url]
                return item
            else:
                job_redis.sadd('error', response.url)
Example 18
class DaneSpider(CrawlSpider):
    formatter = LinksFormatter()
    name = "Dane"
    domain = "funes.uniandes.edu.co"
    allowed_domains = [domain]
    start_urls = ('http://' + domain, )

    rules = [
        Rule(LxmlLinkExtractor(allow=(),
                               allow_domains=domain,
                               process_value=formatter.formatLink),
             'parsePages',
             follow=True)
    ]

    def parsePages(self, response):
        linkExtractor = LxmlLinkExtractor(
            deny_extensions=[], process_value=self.formatter.formatLink)
        item = ScraperdaneItem()
        item["name"] = response.url
        item["children"] = [
            link.url for link in linkExtractor.extract_links(response)
        ]
        return item
Example 19
class QuotesSpider(CrawlSpider):
    name = "wiki"
    allowed_domains = ["en.wikipedia.org"]
    deny = [
        '#', 'index.php', 'Wikipedia:', 'Portal:', 'Special:', 'Help:',
        'Talk:', 'File:', 'User:', 'Template:', 'Category:', '/Main_Page'
    ]
    start_urls = [
        'https://en.wikipedia.org',
    ]

    rules = (Rule(LxmlLinkExtractor(allow_domains=allowed_domains, deny=deny),
                  callback='parse_obj',
                  follow=True), )

    def parse_obj(self, response):
        item = MyItem()
        item['url'] = []

        for link in LxmlLinkExtractor(allow_domains=self.allowed_domains,
                                      deny=self.deny).extract_links(response):
            item['url'].append(link.url.rstrip('/'))

        yield {'url': response.url.rstrip('/'), 'items': item['url']}
Example 20
class JxcbwNewsSpider(CrawlSpider):
    name = "jxcbw_news_spider"
    allowed_domains = ['jxcbw.cn']
    start_urls = ['http://www.jxcbw.cn/mainpages/default.aspx']
    #http://www.jxcbw.cn/mainpages/NewsInfo.aspx?NewsID=69366&NewsType=LE123
    #http://www.jxcbw.cn/mainpages/NewsInfo.aspx?NewsID=70152&NewsType=LE107
    url_pattern = r'http://www\.jxcbw\.cn/[a-z]+/NewsInfo\.aspx\?(NewsID=\d{3,8}&NewsType=LE\d{1,7})'
    rules = [
            Rule(LxmlLinkExtractor(allow=[url_pattern]), callback='parse_news', follow=True)
         ]
    def parse_news(self, response):
        sel = Selector(response)
        pattern = re.match(self.url_pattern, str(response.url))
        source = 'www.jxcbw.cn'
        time = sel.xpath('//span[@class="time fl"]/text()').extract()
        date = time[0]
        title = sel.xpath('//h2[@class="title-class2"]/text()').extract()
        newsId = pattern.group(1)
        url = response.url
        # if sel.xpath('//div[@id="content"]/div/text()'):
        #     contents = ListCombiner(sel.xpath('//div[@id="content"]/div/text()').extract())
        # else:
        #     contents = "unknown"
        comments = 0
        contents = 0

        item = NewsItem()
        item['source'] = source
        item['title'] = title
        item['date'] = date
        item['time'] = time
        item['newsId'] = newsId
        item['url'] = url
        item['contents'] = contents
        item['comments'] = comments
        yield item
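
A quick check of the escaped pattern above against the sample URL from the comments (the capture group is what parse_news stores as newsId):

    import re

    url_pattern = r'http://www\.jxcbw\.cn/[a-z]+/NewsInfo\.aspx\?(NewsID=\d{3,8}&NewsType=LE\d{1,7})'
    sample = 'http://www.jxcbw.cn/mainpages/NewsInfo.aspx?NewsID=69366&NewsType=LE123'
    print(re.match(url_pattern, sample).group(1))
    # NewsID=69366&NewsType=LE123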
Example 21
class A80sSpider(CrawlSpider):
    name = '80s'
    allowed_domains = ['www.80s.tw']

    def start_requests(self):
        for page in range(1, self.settings.get('MAX_PAGE') + 1):
            url = 'https://www.80s.tw/movie/list/-----p/{page}'.format(
                page=page)
            yield scrapy.Request(url=url, )

    rules = (Rule(LxmlLinkExtractor(
        allow='/movie/\d+',
        restrict_css=
        '#block3 > div.clearfix.noborder.block1 > ul.me1.clearfix > li > a'),
                  callback='parse_detail'), )

    def parse_detail(self, response):
        item = Movie1Item()
        item['href'] = response.css(
            '#myform > ul > li.clearfix.dlurlelement.backcolor1 > span.dlname.nm > span > a::attr(href)'
        ).extract_first()
        item['title'] = response.css(
            '#minfo > div.info > h1::text').extract_first()
        yield item
Example 22
class axxelerate_spider(CrawlSpider):
    name = 'axxelerate'
    allowed_domains = ['en.wikipedia.org']
    start_urls = ['https://en.wikipedia.org/wiki/Main_Page']
    rules = (Rule(LxmlLinkExtractor(allow=(allowed_domains)), callback='parse_obj', follow=True),)

    def parse_obj(self,response):
        item = url_item()
        item['url'] = response.url
        item['keywords'] = []
        tags = ["h1", "title", "article", "div", "blockquote", "td", "li", "p", "span", "strong", "b", "i"]
        for tag in tags:
            texts = response.xpath("//%s/text()" % (tag)).extract()
            for text in texts:
                text =  text.encode("latin1", "ignore")
                result = modify_query.query(text)
                item['keywords'] = item['keywords'] + result
        item['title'] = response.xpath("//title/text()").extract_first()
        item['keywords'] = set(item['keywords'])
        item['linksTo'] = []
        for link in LxmlLinkExtractor(allow=(),deny = ()).extract_links(response):
            if link.url.startswith('https://en.wikipedia.org'):
                item['linksTo'].append(link.url)
        return item
Example 23
 def parseLinks(self, response):
     links = LxmlLinkExtractor(allow=('http://www.bentleyhomes.com.au/properties/[\w-]+/$')).extract_links(response)
     for link in links:
         yield Request(link.url, callback=self.parseItem, meta=response.meta)
Example 24
class BentleyhomesSpider(CrawlSpider):

    name = 'bentleyhomes'
    allowed_domains = ['www.bentleyhomes.com.au']
    start_urls = ['http://www.bentleyhomes.com.au/']
    rules = (
        Rule(LxmlLinkExtractor(allow=('http://www.bentleyhomes.com.au/home-designs/search-home-designs/$')),
             follow=True, callback='parseForm'),
        Rule(LxmlLinkExtractor(allow=('http://www.bentleyhomes.com.au/house-and-land/browse-our-hl-packages/$')),
             follow=True, callback='parseList'),
        Rule(LxmlLinkExtractor(allow=('http://www.bentleyhomes.com.au/displays-homes/ex-display-homes-for-sale/$')),
             follow=True),

    )
    oth = ('Games','Studio','Games Room','Leisure','Rumpus','Rooms','Grand Living','Bedroom 5',
           'Living','Retreat','M.P.R')
    logo = 'Bentley Homes'


    def parseForm(self,response):
        if response.url.find('home-designs') != -1:
            callback = self.parseLinks
        elif response.url.find('browse-our-hl-packages') != -1:
            callback = self.parseList

        for i in range(1,4):
            formdata = {'storeys_filter': str(i),
                        'submit': 'Search'}
            yield FormRequest(response.url,
                            formdata=formdata,
                            meta={'Storey':str(i)},
                            callback=callback)

    def parseLinks(self, response):
        links = LxmlLinkExtractor(allow=('http://www.bentleyhomes.com.au/properties/[\w-]+/$')).extract_links(response)
        for link in links:
            yield Request(link.url, callback=self.parseItem, meta=response.meta)

    def parseList(self,response):
        referer = response.request.headers.get('Referer', None).decode("utf-8")
        hxs = HtmlXPathSelector(response)
        hxsItemsList = hxs.xpath('//div[@class="property-item"]')
        for hxsItem in hxsItemsList:
            l = RealtyLoader(RealtyspidersItem(), hxsItem)
            l.add_value('url', response.url)
            l.add_value('BuildType', 'Browse our H&L packages')
            l.add_value('BuilderLogo', self.logo)
            l.add_xpath('Lot_BlockAddress', './/span[@class="street"]/text()')
            l.add_xpath('Squares', './/span[@class="area"]/text()')
            l.add_xpath('Bedrooms', '//li[@class="beds"]/text()')
            l.add_xpath('Bathrooms', '//li[@class="baths"]/text()')
            l.add_xpath('Garage', '//li[@class="garages"]/text()')
            l.add_xpath('LivingArea', '//li[@class="storeys"]/text()')
            l.add_xpath('BasePrice',
                    './/div[@class="field-prefix" and text()="$"]/following-sibling::div[@class="field-value"]/text()')
            l.add_xpath('HomeDesignMainImage', './/img/@src')
            yield l.load_item()

    def parseItem(self, response):
        referer = response.request.headers.get('Referer', None).decode("utf-8")
        hxs = HtmlXPathSelector(response)
        # with open('testURL', 'a') as file:
        #     file.write(str(response.meta)+ '\n')
        #     file.writelines('\n'.join(hxs.xpath('//div[@class="col-md-8"]/table/tbody/tr/td[1]/text()').extract()))
        roomsXpath = '''//div[@class="room_dimensions overview_table"]
                        //tr/td[text()="Master Bedroom"]/following-sibling::td/text()'''
        overviewXpath = '''//table[@id="hf-property-overview"]/tr/td/div[text()="{}"]/ancestor::td/following-sibling::
                            td[@class="item-value"]/div/div[@class="field-value"]/text()'''
        imgXpath = '//div[@class=" flexslider_gallery image hf-property-gallery"]/div/ul/li[{}]/img/@src'
        descriptionXPath = '//div[@id="col-md-8"]/p/text()'
        # data = hxs.xpath(roomsXpath).extract()
        # with open('testURL','a') as file:
        #     for i in data:
        #         file.write(i+'\n')
        other = []
        for name in self.oth:
            size = hxs.xpath(roomsXpath.format(name)).extract_first()
            if size:
                other.append('{}:{}'.format(name, size))

        l = RealtyLoader(RealtyspidersItem(), hxs)
        l.add_value('url', response.url)
        l.add_value('BuildType', self._getBuildType(referer))
        l.add_value('BuilderLogo', self.logo)
        l.add_xpath('DesignName', '//h3[@class="title-post"]/text()')
        l.add_value('State', 'MELBOURNE')
        l.add_xpath('Squares', '//div[@class="info-box1 "]/p[1]/text()')
        l.add_xpath('Bedrooms', '//li[@class="beds"]/text()')
        l.add_xpath('Bathrooms', '//li[@class="baths"]/text()')
        l.add_xpath('Garage', '//li[@class="garages"]/text()')
        l.add_xpath('BasePrice',
                    '//div[@class="field-prefix" and text()="$"]/following-sibling::div[@class="field-value"]/text()')

        l.add_value('Storey', self._getStorey(response.meta['Storey']))

        l.add_xpath('HouseWidth', '//div[text()="MIN. BLOCK WIDTH"]/text()[2]')
        l.add_xpath('HouseLength', '//div[text()="\n                        MIN. BLOCK LENGTH"]/text()[2]')
        l.add_xpath('BrochureImage_pdf', '//a[text()="Brochure"]/@href')
        l.add_xpath('InclusionsImage_pdf', '//a[text()="Inclusions"]/@href')
        l.add_xpath('FloorPlanImage1', '//a[@class="floor-plan fancybox"]/img/@src')
        l.add_xpath('HomeDesignMainImage', imgXpath.format('1'))
        l.add_xpath('Image1', imgXpath.format('2'))
        l.add_xpath('Image2', imgXpath.format('3'))
        l.add_xpath('Image3', imgXpath.format('4'))
        l.add_xpath('Image4', imgXpath.format('5'))
        l.add_xpath('Image5', imgXpath.format('6'))
        l.add_xpath('Image6', imgXpath.format('7'))
        l.add_xpath('Image7', imgXpath.format('8'))
        l.add_xpath('Image8', imgXpath.format('9'))
        l.add_xpath('Image9', imgXpath.format('10'))
        l.add_xpath('Image10', imgXpath.format('11'))
        l.add_xpath('Image11', imgXpath.format('12'))
        l.add_xpath('Image12', imgXpath.format('13'))
        l.add_xpath('Image13', imgXpath.format('14'))
        l.add_xpath('Image14', imgXpath.format('15'))
        l.add_xpath('Image15', imgXpath.format('16'))




        l.add_xpath('MasterBedroomDimension', roomsXpath.format('Master Bedroom'))
        l.add_xpath('Bedroom2Dimension', roomsXpath.format('Bedroom 2'))
        l.add_xpath('Bedroom3Dimension', roomsXpath.format('Bedroom 3'))
        l.add_xpath('Bedroom4Dimension', roomsXpath.format('Bedroom 4'))
        l.add_xpath('StudyDimension', [roomsXpath.format('Study'),roomsXpath.format('Study nook')])
        l.add_xpath('Meals_DiningDimension', roomsXpath.format('Meals'))
        l.add_xpath('FamilyDimension', roomsXpath.format('Family'))
        l.add_xpath('AlfrescoDimension', roomsXpath.format('Alfresco'))
        l.add_xpath('LoungeDimension', roomsXpath.format('Lounge'))
        l.add_xpath('TheatreDimension', roomsXpath.format('Theatre'))
        l.add_value('OtherInclusions', ', '.join(other))

        # Block Yes No
        l.add_xpath('TheatreRoom_Yes_No',
                    roomsXpath.format('Theatre'))
        l.add_xpath('SeparateMeals_Yes_No',
                    roomsXpath.format('Meals'))
        l.add_xpath('Alfresco_Yes_No',
                    roomsXpath.format('Alfresco'))
        l.add_xpath('Study_Yes_No',
                    [roomsXpath.format('Study Nook'),roomsXpath.format('Study')])
        l.add_xpath('WalkinPantry_Yes_No',
                    descriptionXPath, **{'re': '([Ww]alkin|[Pp]antry)'})
        l.add_xpath('BultersPantry_Yes_No',
                    descriptionXPath, **{'re': '[Bb]ulter[`]?s?'})
        l.add_xpath('SteelStructure_Yes_No',
                    descriptionXPath, **{'re': '([Ss]teel.*[Ss]tructure)|([Ss]tructure.*[Ss]teel)'})
        l.add_xpath('Balcony_Yes_No',
                    roomsXpath.format('Balcony'))
        #
        # Warranty
        l.add_xpath('SturturalWarranty',
                    descriptionXPath, **{'re': '.*guarantee.*|.*[Ww]arranty.*'})
        # Windows
        l.add_xpath('Windows',
                    descriptionXPath, **{'re': '.*[Ww]indows?.*'})
        # Kitchen benchtop
        l.add_xpath('KitchenBenchtop',
                    descriptionXPath, **{'re': '.*[Kk]itchen.*[Bb]enchtop.*|.*[Bb]enchtop.*[Kk]itchen.*'})
        # Security system
        l.add_xpath('SecuritySystem',
                    descriptionXPath, **{'re': '.*[Ss]ecurity.*[sS]ystem.*|.*[sS]ystem.*[Ss]ecurity.*'})
        # Energy rating
        l.add_xpath('EnergyRating',
                    descriptionXPath, **{'re': '.*[Ee]nergy.*[rR]ating.*|.*[rR]ating.*[Ee]nergy.*'})
        # Kitchen appliances
        l.add_xpath('KitchenAppliance',
                    descriptionXPath, **{'re': '.*([Kk]itchen.*[Aa]ppliance).*|.*([Aa]ppliance.*[Kk]itchen).*'})
        # Appliance brand
        l.add_xpath('ApplianceBrand',
                    descriptionXPath, **{'re': '.*[\w\s]+[Ss]ecurity System.*'})
        # Splashback tiles above the sink
        l.add_xpath('Splashback',
                    descriptionXPath, **{'re': '.*[Ss]plashback.*'})
        # Floor covering
        l.add_xpath('FloorCovering',
                    descriptionXPath, **{'re': '.*[Ff]loor.*[Cc]overings?.*|.*[Cc]overings?.*[Ff]loor.*'})
        # Cooling
        l.add_xpath('Cooling',
                    descriptionXPath, **{'re': '.*[Cc]ooling.*'})
        # Bath
        l.add_xpath('Bath',
                    descriptionXPath, **{'re': '.*[Ss]ecurity.*[Ss]ystem.*'})
        # Ceiling height
        l.add_xpath('CeilingHeight',
                    descriptionXPath, **{'re': '.*[Bb]ath.*'})
        # Bathroom wall tiling
        l.add_xpath('EnsuiteWallTiling',
                    descriptionXPath, **{'re': '.*[Tt]ile.*'})
        # Bathroom benchtop
        l.add_xpath('EnsuiteBenchtop',
                    descriptionXPath, **{'re': '.*[Ee]nsuite.*[Bb]enchtop.*|.*[Bb]enchtop.*[Ee]nsuite.*'})
        # Shower
        l.add_xpath('EnsuiteShowerbase',
                    descriptionXPath, **{'re': '.*[Ss]howerbase.*'})
        # Wall paint
        l.add_xpath('WallPaint',
                    descriptionXPath, **{'re': '.*[Ww]all.*[Pp]aint.*|.*[Pp]aint.*[Ww]all.*'})
        # Walk-in robe
        l.add_xpath('WIRFitouts',
                    descriptionXPath, **{'re': '.*walk in robe.*|.*WIR.*'})
        # Downlights
        l.add_xpath('Downlights',
                    descriptionXPath, **{'re': '.*[Dd]ownlights.*'})
        # Landscaping
        l.add_xpath('Landscaping',
                    descriptionXPath, **{'re': '.*[Ll]andscaping.*'})
        # Driveway
        l.add_xpath('Driveway',
                    descriptionXPath, **{'re': '.*[Dd]riveway.*'})
        # Promotion
        l.add_xpath('Promotion',
                    descriptionXPath, **{'re': '.*[Pp]romotion.*'})
        # # # other inclusions
        # # l.add_xpath('OtherInclusions',
        # #             descriptionXPath, **{'re': '[\w\s]+[Ss]ecurity System'})
        # # l.add_xpath('OtherInclusions1',
        # #             descriptionXPath, **{'re': '[\w\s]+[Ss]ecurity System'})
        # # l.add_xpath('OtherInclusions2',
        # #             descriptionXPath, **{'re': '[\w\s]+[Ss]ecurity System'})
        # # l.add_xpath('OtherInclusions3',
        # #             descriptionXPath, **{'re': '[\w\s]+[Ss]ecurity System'})
        # # l.add_xpath('OtherInclusions4',
        # #             descriptionXPath, **{'re': '[\w\s]+[Ss]ecurity System'})
        # # l.add_xpath('OtherInclusions5',
        # #             descriptionXPath, **{'re': '[\w\s]+[Ss]ecurity System'})
        return l.load_item()



    def _getBuildType(self, url):

        if url.find('dual-occupancy') != -1:
            return 'Dual Occupancy'
        elif url.find('ex-display-homes-for-sale') != -1:
            return 'Display Homes for Sale'
        elif url.find('view-displays-homes') != -1:
            return 'Display Homes'
        elif url.find('completed-homes') != -1:
            return 'Completed Homes for Sale'
        elif url.find('search-home-designs') != -1:
            return 'Home Designs'
        elif url.find('browse-our-hl-packages') != -1:
            return 'H&L packages'
            # elif url.find('ex-display-homes-for-sale') != -1:
            #     return 'Display Homes for Sale'

    def _getStorey(self, data):
        if data == '1':
            return 'Single'
        elif data == '2':
            return 'Double'
        elif data == '3':
            return 'Split level'
Example 25
class DarkWebSpider(CrawlSpider):

    name = 'darkWebBot'

    allowed_domains = ["onion"]
    start_urls = [
        "https://ahmia.fi/address/"  # "https://ahmia.fi/address/" #"http://check.torproject.org/"
    ]

    rules = (Rule(LxmlLinkExtractor(allow=()),
                  callback="parse_item",
                  follow=True), )

    def parse_item(self, response):

        # #i = response.xpath('//h1/@class').extract()[0]
        # #i['name'] = response.xpath('//div[@id="name"]').extract()
        # #i['description'] = response.xpath('//div[@id="description"]').extract()
        # f = open("/Users/laveeshrohra/Documents/Workspace/checkPolipo.txt", "w+")
        # f.write("class = %s" % (response.body))
        # f.close()

        hxs = HtmlXPathSelector(response)
        item = CrawledWebsiteItem()
        item['url'] = response.url
        item['server_header'] = str(response.headers)
        title_list = hxs.xpath('//title/text()').extract()
        h1_list = hxs.xpath("//h1/text()").extract()
        item['h1'] = " ".join(h1_list)
        h2_list = hxs.xpath("//h2/text()").extract()
        item['h2'] = " ".join(h2_list)
        title = ' '.join(title_list)
        item['title'] = title
        body_text = self.html2string(response)
        words = self.extract_words(body_text)
        item['text'] = title + " " + " ".join(words)
        return item

    def detect_encoding(self, response):
        return response.headers.encoding or "utf-8"

    def html2string(self, response):
        """HTML 2 string converter. Returns a string."""
        converter = html2text.HTML2Text()
        converter.ignore_links = True
        encoding = self.detect_encoding(response)
        decoded_html = response.body.decode(encoding, 'ignore')
        string = converter.handle(decoded_html)
        return string

    def extract_words(self, html_string):
        """Stems and counts the words. Works only in English!"""
        string_list = re.split(r' |\n|#|\*', html_string)
        # Cut a word list that is larger than 10000 words
        if len(string_list) > 10000:
            string_list = string_list[0:10000]
        words = []
        for word in string_list:
            # Word must be longer than 0 letter
            # And shorter than 45
            # The longest word in a major English dictionary is
            # Pneumonoultramicroscopicsilicovolcanoconiosis (45 letters)
            if len(word) > 0 and len(word) <= 45:
                words.append(word)
        return words
Example 26
class FoneArenaSpider(AlaCrawlSpider):
    name = 'fonearena'
    allowed_domains = ['fonearena.com']
    start_urls = ['http://www.fonearena.com/reviews.php']

    rules = [Rule(LxmlLinkExtractor(restrict_xpaths='//figure[@class="effect3"]/a',
                                    unique=True),
                  callback="parse_review"),
             Rule(LxmlLinkExtractor(restrict_xpaths='//a[@title="next page"]',
                                    unique=True))
             ]

    def parse_review(self, response):

        if not response.url.endswith(".php"):
            product = ProductItem()
            review = ReviewItem()

            review['TestTitle'] = self.extract(response.xpath('//h2/text()'))

            if review['TestTitle']:
                matches = re.search("^(.*?) review", review['TestTitle'], re.IGNORECASE)
                if matches:
                    review['ProductName'] = matches.group(1)
                    product['ProductName'] = matches.group(1)
                else:
                    review['ProductName'] = review['TestTitle']
                    product['ProductName'] = review['TestTitle']

            review['Author'] = self.extract(response.xpath('//a[@rel="author"]/text()'))

            date_span = self.extract(response.xpath('//span[@class="updated"]/text()'))
            if date_span:
                matches = re.search(r'(\S+ \d+, \d+) ', date_span)
                if matches:
                    date_span = matches.group(1)
                    review['TestDateText'] = date_format(date_span, '%B %d, %Y')

            product['PicURL'] = self.extract(response.xpath('//div[contains(@class,"entry")]/p//img/@src'))

            review['TestSummary'] = self.extract_all(response.xpath('//div[contains(@class,"entry")]/p[1]//text()'), separator=" ")
            if not review['TestSummary']:
                review['TestSummary'] = self.extract_all(response.xpath('//div[contains(@class,"entry")]/p[2]//text()'), separator=" ")

            review['TestVerdict'] = self.extract_all(response.xpath('//div[contains(@class,"entry")]/p[strong[contains(text(),"Conclusion")]]/following-sibling::p/text()'), separator=" ")
            if not review['TestVerdict']:
                review['TestVerdict'] = self.extract_all(response.xpath('//div[contains(@class,"entry")]/h2[contains(text(),"Conclusion")]/following-sibling::p/text()'), separator=" ")

            review['TestPros'] = self.extract_all(response.xpath('//div[contains(@class,"entry")]/p[strong[contains(text(),"Pros")]]/following-sibling::*[1]/li/text()'), separator="; ")
            if not review['TestPros']:
                review['TestPros'] = self.extract_all(response.xpath('//div[contains(@class,"entry")]/h3[contains(text(),"Pros")]/following-sibling::*[1]/li/text()'), separator="; ")

            review['TestCons'] = self.extract_all(response.xpath('//div[contains(@class,"entry")]/p[strong[contains(text(),"Cons")]]/following-sibling::*[1]/li/text()'), separator="; ")
            if not review['TestCons']:
                review['TestCons'] = self.extract_all(response.xpath('//div[contains(@class,"entry")]/h3[contains(text(),"Cons")]/following-sibling::*[1]/li/text()'), separator="; ")

            product['OriginalCategoryName'] = "Miscellaneous"
            review['DBaseCategoryName'] = "PRO"

            product['TestUrl'] = response.url
            review['TestUrl'] = response.url

            yield product
            yield review
Example 27
class imdb_spider(CrawlSpider):

    settings = get_project_settings()

    name = "imdb_spider"
    allowed_domains = ['imdb.com']

    start_urls = ['https://www.imdb.com/search/title?release_date=1980-01-01,']
    deny_urls = ['']

    with open(settings.get('DENIED_DOMAINS')) as f:
        content = f.readlines()

    no_domains = [x.strip() for x in content]

    no_ext = ['']
    tags = [
        'a', 'area', 'audio', 'embed', 'iframe', 'img', 'input', 'script',
        'source', 'track', 'video', 'form'
    ]
    # attrs = ['href', 'src', 'action']
    attrs = ['href']

    people_links = {}
    detail_fields = [
        "Taglines:", "Country:", "Language:", "Budget:",
        "Cumulative Worldwide Gross:", "Production Co:"
    ]
    director_fields = ["Director:", "Writers:"]

    movie_link = r'/title/\w+/\?ref_=adv_li_tt'
    nextpage_link = r'/search/title\?release_date=1980-01-01,&start=\d+&ref_=adv_nxt'

    rules = (
        Rule(LxmlLinkExtractor(allow=movie_link),
             callback='parse_movie',
             follow=False),
        Rule(LxmlLinkExtractor(allow=nextpage_link),
             callback='parse_nextpage',
             follow=True),
    )

    def parse_nextpage(self, response):
        print("[  PAGE  ]  {}".format(response.request.url))

    def parse_movie(self, response):

        # logger.info(">>>>> Movie: {}".format(response.request.url))
        print("[  MOVIE  ]  {}".format(response.request.url))

        # inputs

        movie_id = response.request.url.split('/')[4]
        title = ''.join(
            list(
                filter(
                    lambda x: x in string.printable,
                    response.xpath('//div[@class="title_wrapper"]/h1/text()').
                    extract_first().strip())))
        film_rating = response.xpath(
            '//div[@class="subtext"]/text()').extract_first()
        duration = response.xpath(
            '//div[@class="subtext"]/time/text()').extract_first()
        genre = ''.join(
            list(
                map(
                    str.strip,
                    str(
                        response.xpath(
                            '//div[@class="subtext"]/a[not(@title="See more release dates")]/text()'
                        ).extract()))))
        release_date = response.xpath(
            '//div[@class="subtext"]/a[@title="See more release dates"]/text()'
        ).extract_first()
        imdb_ratingValue = response.xpath(
            '//span[@itemprop="ratingValue"]/text()').extract_first()
        imdb_bestRating = response.xpath(
            '//span[@itemprop="bestRating"]/text()').extract_first()
        imdb_ratingCount = response.xpath(
            '//span[@itemprop="ratingCount"]/text()').extract_first()

        description = response.xpath(
            '//div[@class="summary_text"]/text()').extract_first()
        storyline = response.xpath(
            '//div[@id="titleStoryLine"]/div/p/span/text()').extract_first()

        lables = response.xpath(
            '//div[contains(@class, "plot_summary")]/div[@class="credit_summary_item"]/h4/text()'
        ).extract()
        credits = dict.fromkeys(['director', 'creator', 'writer', 'stars'])
        k = 0
        for x in lables:
            persons = response.xpath(
                '//div[contains(@class, "plot_summary")]/div[' + str(k) +
                '][@class="credit_summary_item"]/a/text()').extract()

            if 'See full cast & crew' in persons:
                persons.remove('See full cast & crew')

            # remove comments between brakets or parenthesis
            persons = [
                re.sub("[\(\[].*?[\)\]]", "", p).strip() for p in persons
            ]

            # director(s), creator(s), writer(s), stars
            if 'director' in x.lower():
                credits['director'] = persons
            if 'creator' in x.lower():
                credits['creator'] = persons
            if 'writer' in x.lower():
                credits['writer'] = persons
            if 'star' in x.lower():
                credits['stars'] = persons

            k += 1

        taglines = ''.join(
            response.xpath(
                '//div[@id="titleStoryLine"]/div[@class="txt-block"]/text()').
            extract()).strip()
        url = response.request.url

        poster = response.xpath(
            '//div[@class="poster"]//a/img/@src').extract_first()

        trailer_img = response.xpath(
            '//div[@class="slate"]//a/img/@src').extract_first()
        req_headers = self.headers_format(response.request.headers)
        res_headers = self.headers_format(response.headers)

        # Cleaning inputs

        if not movie_id or not title:
            return

        # convert released_date unicode into string
        film_rating = film_rating.encode('ascii', 'ignore')
        film_rating = film_rating.strip(
        ) if film_rating and type(film_rating) is str else ''

        # convert released_date unicode into string
        release_date = release_date.encode('ascii', 'ignore')
        release_date = release_date.strip(
        ) if release_date and type(release_date) is str else ''

        # if it's a movie, it will be in "11, MARCH 2013 (USA)"format
        # split string into time only without country name, then convert into datetime and unix time
        if release_date[0].isdigit() == True:
            release_date_unix_time = parser.parse(release_date.split("(")[0])
            release_date_unix_time = time.mktime(
                release_date_unix_time.timetuple())
        # if it's a TV series, it will be in "TV SERIES (2013 - ?)"format
        # split string into only 4 digit year, then convert into datetime and unix time
        if release_date.split("(")[1][0:4].isdigit():
            release_date_unix_time = parser.parse(
                "1, Jan " + release_date.split("(")[1][0:4])
            release_date_unix_time = time.mktime(
                release_date_unix_time.timetuple())

        # convert duration unicode into string
        if (duration is not None):
            duration = duration.encode('ascii', 'ignore')
            duration = duration.strip(
            ) if duration and type(duration) is str else ''
            # duration is in "1h 40min" format, split int out from string into array ["1","40"]
            hour_min = (re.findall(r'\d+', duration))
            # if hour_min array has 2 elements, then first element will be hour and second will be min
            if (len(hour_min) == 2):
                duration = (int(hour_min[0]) * 60 + int(hour_min[1]))
            # if hour_min array has 1 elements, then it could be minute or hour
            if (len(hour_min) == 1):
                # if hour_min has hour element like ["3h"], then last char would be h
                if (duration[-1:] == "h"):
                    duration = (int(hour_min[0]) * 60)
                # else it would be min
                else:
                    duration = (int(hour_min[0]))

        imdb_ratingValue = self.input2num(imdb_ratingValue)
        imdb_ratingCount = self.input2num(imdb_ratingCount)
        imdb_bestRating = self.input2num(imdb_bestRating)

        # convert description unicode into string
        description = description.encode('ascii', 'ignore')
        description = description.strip(
        ) if description and type(description) is str else ''

        # convert storyline unicode into string
        storyline = storyline.encode('ascii', 'ignore')
        storyline = storyline.strip(
        ) if storyline and type(storyline) is str else ''

        # Output
        item = ImdbScraperItem()

        item['movie_id'] = movie_id
        item['title'] = title
        item['film_rating'] = film_rating
        item['poster'] = poster
        item['trailer_img'] = trailer_img
        item['duration'] = duration
        item['genre'] = genre
        item['release_date'] = release_date
        item['imdb_ratingValue'] = imdb_ratingValue
        item['imdb_bestRating'] = imdb_bestRating
        item['imdb_ratingCount'] = imdb_ratingCount
        item['description'] = description
        item['release_date_unix_time'] = release_date_unix_time
        item['storyline'] = storyline
        item['director'] = credits.get('director', '')
        item['writer'] = credits.get('writer', '')
        item['creator'] = credits.get('creator', '')
        item['stars'] = credits.get('stars', '')
        item['taglines'] = taglines
        item['url'] = url
        item['req_headers'] = req_headers
        item['res_headers'] = res_headers

        yield item

    def input2num(self, iput):

        regnum = re.compile("^(?=.*?\d)\d*[.,]?\d*$")
        if iput:
            if iput.isdigit():
                return float(iput)

            oput = iput.replace(",", "")
            if regnum.match(oput):
                return float(oput)
        return -1

    def headers_format(self, header):
        hdr = {}
        for key, value in header.items():
            if isinstance(key, (bytes, bytearray)):
                hdr[key.decode('utf-8')] = b''.join(value).decode('utf-8')
            else:
                hdr[key] = ''.join(value)

        return json.dumps(hdr, ensure_ascii=False)
Example 28
        ignoreDomains = [domain.strip() for domain in g.readlines()]
        g.close()
        if len(ignoreDomains) == 0:
            print "ignoreDomains.txt empty. No domains to be ignored initially."

        for domain in ignoreDomains:
            dcount.set_ignored_domain(domain)

    except IOError, e:
        print "No ignoreDomains.txt found. No domains to be ignored initially."

    # The spider follows this rule for each group of links encountered on a page
    rules = (Rule(LxmlLinkExtractor(allow=[r'.+\.(com|org|net|).*'],
                                    deny=[
                                        r'.+\.(jpg|png|pdf|mp4|mp3|zip| \
                        torrent|mov|gif|txt|csv|webm|epub)'
                                    ],
                                    deny_domains=dcount.get_ignored_domains(),
                                    unique=True),
                  callback='parse_item',
                  process_links='process_links',
                  follow=True), )

    def process_links(self, links):
        """
        Called for each list of links collected by the spider.
        Discards those links which have domains in ignoreDomains.

        :param links: A list of scraped Link objects collected by the spider
        :return: a list of Link objects from "good" domains.
        """
Example 29
class MopSpider(CrawlSpider):
    name = 'people_bbs'

    start_urls = ['http://bbs1.people.com.cn/']

    post_extract = LxmlLinkExtractor(
        allow=(
            '/post/',
        ),
        allow_domains=(
            'bbs1.people.com.cn'
        ),
        # deny=(
        #
        # ),
        # deny_domains=(
        #
        # )
    )

    author_extract = LxmlLinkExtractor(
        allow=(
            '/userInfo\.do\?',
        ),
        allow_domains=(
            'bbs1.people.com.cn',
        ),
        deny=(
            '/userInfo\.do\?action=thread',
            '/userInfo\.do\?action=follow',
            '/userInfo\.do\?action=jinghua',
            '/userInfo\.do\?orderBy=',
        ),
        # deny_domains=(
        #
        # )
    )

    follow_extract = LxmlLinkExtractor(
        # allow=(
        #     '/s/[0-9]+',
        # ),
        allow_domains=(
            'bbs1.people.com.cn',
        ),
        deny=(
            '/userInfo\.do\?action=thread',
            '/userInfo\.do\?action=follow',
            '/userInfo\.do\?action=jinghua',
            '/userInfo\.do\?orderBy=',
        ),
        # deny_domains=(
        #     'q.blog.sina.com.cn'
        # )
    )

    rules = (
        Rule(author_extract, follow=True, callback='parse_author'),
        Rule(post_extract, follow=True, callback='parse_post'),
        # Rule(follow_extract, follow=True, callback='parse_follow'),
        # the original rule had an empty 'process_request=' keyword; dropped here so the tuple parses
        Rule(follow_extract, follow=True),
    )
    #
    # a_count = 0
    # p_count = 0
    # f_count = 0
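
    # For reference: Rule's process_request (whose value is missing in the
    # follow_extract rule above) accepts a callable or the name of a spider
    # method; it is called for each Request extracted by the rule and should
    # return a Request or None. A hypothetical hook, not part of the original:
    def tag_follow_request(self, request, response=None):
        # newer Scrapy versions also pass the originating response
        request.meta['via_follow_rule'] = True
        return request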

    def parse_author(self, response):
        # self.a_count += 1
        # print('author: ', self.a_count, '  ', response.url)

        author_item = get_author_item(response)

        if author_item:

            yield author_item

    def parse_post(self, response):
        # self.p_count += 1
        # print('post: ', self.p_count, '  ', response.url)

        post_item = get_post_item(response)

        content_href = post_item['content_href']

        if content_href:
            yield Request(
                url=content_href,
                callback=self.parse_content,
                meta={
                    'post_item': post_item
                }
            )
        else:
            pass

    def parse_content(self, response):
        post_item = response.meta['post_item']

        content, picture_hrefs = get_post_content(response)

        post_item['content'] = content
        post_item['picture_hrefs'] = picture_hrefs

        for comment_item in get_comment_list(response):
            post_item['comment_ids'].append(comment_item['comment_id'])

            yield comment_item

        yield post_item
Example 30
class PharmnetCrawlSpider(CrawlSpider):
    """Pharmaceutical industry website pharmnet.com.cn"""
    name = 'pharmnet'
    allowed_domains = ['pharmnet.com.cn']
    start_urls = [
        'http://news.pharmnet.com.cn/news/hyyw/news/index0.html',
        # 'http://news.pharmnet.com.cn/news/hyyw/news/index1.html',
    ]

    rules = (
        # LxmlLinkExtractor extracts the list of links
        Rule(LxmlLinkExtractor(allow=(r'/news/\d{4}/\d{2}/\d{2}/\d+\.html',
                                      r'/news/hyyw/news/index\d+\.html'),
                               restrict_xpaths=('//div[@class="list"]',
                                                '//div[@class="page"]')),
             callback='parse_links',
             follow=False), )

    def parse_links(self, response):
        # If this is an article link (not an index page), handle it directly
        if '/hyyw/' not in response.url:
            yield self.parse_page(response)
        else:
            self.log('-------------------> link_list url=%s' % response.url,
                     logging.INFO)
            links = response.xpath('//div[@class="list"]/ul/li/p/a')
            for link in links:
                url = link.xpath('@href').extract()[0]
                yield Request(url=url, callback=self.parse_page)

    def parse_page(self, response):
        try:
            self.log('-------------------> link_page url=%s' % response.url,
                     logging.INFO)
            item = NewsItem()
            item['crawlkey'] = self.name
            item['category'] = ltos(
                response.xpath(
                    '//div[@class="current"]/a[last()]/text()').extract())
            item['link'] = response.url
            head_line = ltos(
                response.xpath('//div[@class="ct01"]/text()[1]').extract())
            item['location'] = head_line.strip().split()[1]
            item['pubdate'] = datetime.strptime(head_line.strip().split()[0],
                                                '%Y-%m-%d')
            item['title'] = ltos(response.xpath('//h1/text()').extract())
            content_temp = "".join([
                tt.strip() for tt in response.xpath(
                    '//div[@class="ct02"]/font/div/div|//div[@class="ct02"]/font/div'
                ).extract()
            ])
            item['content'] = filter_tags(content_temp)
            hc = ltos(response.xpath('//div[@class="ct02"]').extract())
            htmlcontent = clean_html(hc)
            # Special constructs that do not create capture groups:
            # (?=...)  lookahead: the following text must match the expression
            # (?<=...) lookbehind: the preceding text must match the expression
            pat_img = re.compile(r'(<img (?:.|\n)*?src=")((.|\n)*?)(?=")')
            uuids = []
            for i, m in enumerate(pat_img.finditer(htmlcontent)):
                full_path = m.group(2)
                suffix_name = '.' + os.path.basename(full_path).split('.')[-1]
                uuid_name = '{0:02d}{1:s}'.format(
                    i + 1,
                    uuid.uuid4().hex) + suffix_name
                uuids.append(uuid_name)
                self.log('UUID_PIC--------%s' % setting.URL_PREFIX + uuid_name,
                         logging.INFO)
                with contextlib.closing(urllib2.urlopen(full_path)) as f:
                    with open(os.path.join(IMAGES_STORE, uuid_name),
                              'wb') as bfile:
                        bfile.write(f.read())
            for indx, val in enumerate(uuids):
                htmlcontent = pat_img.sub(
                    Nth(indx + 1, setting.URL_PREFIX + val), htmlcontent)
            item['htmlcontent'] = htmlcontent
            self.log(
                '+++++++++title=%s+++++++++' % item['title'].encode('utf-8'),
                logging.INFO)
            return item
        except:
            self.log('ERROR-----%s' % response.url, logging.ERROR)
            raise DropItem('DropItem-----%s' % response.url)