Example #1
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name' : "//div[@class='title']/h2",
    'price' : "//p[@class='pridis']/strong",
    'category' : "",
    'description' : "//div[@class='detail']",
    'images' : "//div [@class='sllist']/ul/li/a/img/@src",
    'canonical' : "",
    'base_url' : "",
    'brand' : ""
}
name = 'necdeal.vn'
allowed_domains = ['necdeal.vn']
start_urls = ['http://necdeal.vn']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(allow = ['/deal/[a-z0-9-]+/[a-z0-9-]+-[a-z0-9]+'], deny = ['.*/$']), 'parse_item'),
    Rule(LinkExtractor(deny = ['.*']), 'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
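These generator-produced modules (this one and the similar ones below) only declare data: XPATH, name, start_urls, rules, and so on; the spider class that consumes them lives elsewhere in the project. Below is a minimal sketch of how such a module might be wired into a generic CrawlSpider; the module path 'spiders.necdeal_vn' and the build_spider helper are assumptions for illustration, not part of the generated files.

# Hypothetical consumer of a generated config module such as the one above.
# The module path and helper name are assumptions, not part of the generator.
import importlib

from scrapy.spiders import CrawlSpider


def build_spider(module_path='spiders.necdeal_vn'):
    cfg = importlib.import_module(module_path)

    class GeneratedSpider(CrawlSpider):
        name = cfg.name
        allowed_domains = cfg.allowed_domains
        start_urls = cfg.start_urls
        rules = cfg.rules

        def parse_item(self, response):
            # Evaluate each configured XPath; empty expressions yield ''.
            return {
                field: (response.xpath(xp).get(default='') if xp else '')
                for field, xp in cfg.XPATH.items()
            }

    return GeneratedSpider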
Example #2
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name': "//h2[@id='hTitleDeal']/span/p",
    'price': "//p[@class='pGiaTienID']/strong",
    'category': "",
    'description': "//div[@class='div_LBodyHL']",
    'images': "//div[@class='lof-main-outer']/img/@src",
    'canonical': "",
    'base_url': "",
    'brand': ""
}
name = 'windeal.vn'
allowed_domains = ['windeal.vn']
start_urls = ['http://windeal.vn/']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(allow=['/xem-san-pham/']), 'parse_item'),
    Rule(LinkExtractor(allow=['\?type']), 'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
Example #3
class HddznetSpider(CrawlSpider):

    __BASE_DOMAIN = 'www.hddznet.com'

    name = 'hddznet'
    allowed_domains = [ __BASE_DOMAIN ]
    start_urls = ['http://www.hddznet.com'] 
    rules = (        

        # Extract product center pages
        Rule(LinkExtractor(allow=(r'product-.*.html$')), follow=True, callback='parse_product'),

        # Extract solutions and case studies
        Rule(LinkExtractor(allow=(r'program-.*.html$')), follow=True, callback='parse_program'),        

        # Extract featured cases
        Rule(LinkExtractor(allow=(r'news/detail.*-jdal.html$')), follow=True, callback='parse_case'),

        # Extract news center pages
        Rule(LinkExtractor(allow=(r'news/detail.*-xwzx.html$')), follow=True, callback='parse_news'),

        # # Extract images .png .jpg .jpeg .bmp
        # Rule(LinkExtractor(allow=(r'www.hddznet.com'), deny_extensions=set(), tags=('img'), attrs=('src'), canonicalize=True, unique=True), \
        #     follow=False, callback='parse_images')

        # Extract all links
        Rule(LinkExtractor(allow=(r'.*')), follow=True),        
    )

    def __init__(self, *args, **kwargs):
        super(HddznetSpider, self).__init__(*args, **kwargs)
        pool = redis.ConnectionPool(host='128.1.6.45', port=6379, decode_responses=True)
        self.redis = redis.Redis(connection_pool=pool) 

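    # Shared helper: skip URLs already cached in Redis; otherwise pull the title
    # and the <span>/<p>/<td> text of the content block into a ContentItem.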
    def __parse(self, response, parse_name, title_class, content_class):
        cache_key = 'uri:url:{0}'.format(response.url)
        if self.redis.exists(cache_key):
            print('xx> SKIP', self.name, response.url, parse_name)
            return None

        title = self.__extract_title(response, '//div[@class="{0}"]/text()'.format(title_class))
        print('==>', self.name, response.url, parse_name, title)
                
        elements = response.xpath('//div[@class="{0}"]//span|//div[@class="{0}"]//p|//div[@class="{0}"]//td'.format(content_class))
        contents = elements.xpath('text()').extract()
        content = clear_content(contents)
        
        item = ContentItem()
        item['company'] = self.name
        item['title'] = title
        item['url'] = response.url
        item['content'] = content

        return item
 
    def parse_product(self, response):
        yield self.__parse(response, '产品中心', 'current-menu', 'right')
        
    def parse_program(self, response):
        yield self.__parse(response, '方案与案例', 'current-menu', 'right')     

    def parse_case(self, response):
        yield self.__parse(response, '经典案例', 'detail-title', 'detail-content')

    def parse_news(self, response):
        yield self.__parse(response, '新闻中心', 'detail-title', 'detail-content')

    # def parse_images(self, response):
    #     print('==>', self.name, '-Images-' , response.url)

    def __extract_title(self, response, xpath=None):        
        if xpath is None:
            return ''
        title = response.xpath(xpath).extract_first()
        return title.strip() if title is not None else ''
Example #4
class XjSpider(SpiderRedis):
    name = 'xinjiangdj'
    website = u'新疆党建网'
    download_delay = 0.1
    allowed_domains = ['www.xjkunlun.cn']
    start_urls = ['http://www.xjkunlun.cn/']

    rules = [
        # Only crawl news from 2015 onward; content extraction does not match reliably on older articles
        Rule(LinkExtractor(allow=r'/201[5-9]/[0-9]+.htm',
                           deny=('/iptv', '/wlsp', '/mobile', '/kxj', '/xzzx',
                                 '/sy.xjkunlun', '/ycjy', '/djkw', '/index')),
             callback='parse_item',
             follow=False),
        Rule(LinkExtractor(allow=('/xinwen', '/gzgz', '/dswx', '/ldjh',
                                  '/dkyj', '/lgbgz', '/wnfw'),
                           deny=('/iptv', '/wlsp', '/mobile', '/kxj', '/xzzx',
                                 '/sy.xjkunlun', '/ycjy', '/djkw')),
             follow=True),
    ]

    def parse_item(self, response):
        loader = ItemLoader(item=SpiderItem(), response=response)
        contents = ''
        try:
            title = response.xpath(
                r'//td[@class="STYLE1"]/div//text()').extract_first()

            content_list = response.xpath(
                r'//div[@class="container"]/div[2]/table/tbody/tr/td/p/text()'
            ).extract()
            if len(content_list) == 0:
                # Fall back to an alternative content selector
                content_list = response.xpath(
                    r'//*[@id="00010"]/table[2]/tbody/tr[2]/td/p/text()'
                ).extract()
            for content in content_list:
                contents = contents + content

            # Two alternative selectors for the publication date
            date_text = response.xpath(
                r'//*[@id="00010"]/table[2]/tbody/tr[1]/td/p/text() | //*[@id="00010"]/table[1]/tbody/tr[3]/td/text()'
            ).extract_first()
            match = re.search(r'(20[0-9]{2}-[0-9]{2}-[0-9]{2})', date_text)
            date = match.group(1)

            ### print info
            # try:
            # print 'title, ', title.encode('GB18030')
            # print 'url, ', response.url
            # print "date, ", date
            # print "content, ", contents.encode('GB18030')
            # except Exception as e:
            #     print " error : ", e

            loader.add_value('title', title)
            loader.add_value('date', date)
        except Exception as e:
            self.logger.error('error url: %s error msg: %s' %
                              (response.url, e))
            loader.add_value('date', '1970-01-01')
            loader.add_value('title', '')
        finally:
            #self.logger.info('crawling url: %s' % response.url)
            loader.add_value('url', response.url)
            loader.add_value('collection_name', self.name)
            loader.add_value('website', self.website)
            if contents == '':
                self.logger.warning(' url: %s msg: %s' %
                                    (response.url, ' content is None'))
            loader.add_value('content', contents)
            return loader.load_item()
Example #5
class XpahaSpider(RedisCrawlSpider):
    name = 'xpaha'
    allowed_domains = ['zidian.xpcha.com']
    start_urls = ['http://zidian.xpcha.com/']
    redis_key = "zidianspider:start_urls"

    # pagelink = LinkExtractor(allow=r'href="/hans/\w+')
    contentlink = LinkExtractor(allow=r'\w.html')

    rules = (
        # Rule(pagelink),
        Rule(contentlink, callback='parse_item'), )

    def parse_item(self, response):
        item = ZidainItem()
        item['zi'] = self.get_zi(response)
        item['thumb'] = self.get_thumb(response)
        item['pinyin'] = self.get_pinyin(response)
        item['wuxing'] = self.get_wuxing(response)
        item['jiegou'] = self.get_jiegou(response)
        item['bushou'] = self.get_bushou(response)
        item['bihua'] = self.get_bihua(response)
        item['base'] = self.get_base(response)
        item['kangxi'] = self.get_kangxi(response)
        item['guhanyu'] = self.get_guhanyu(response)
        item['xiangxi'] = self.get_xiangxi(response)
        item['develop'] = self.get_develop(response)
        item['request'] = response.url
        yield item

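    # Each get_* helper below reads one field with a fixed XPath/CSS selector
    # and falls back to an empty string when the node is missing.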
    def get_zi(self, response):
        try:
            item = response.xpath('/html/body/div[5]/div[1]/dl/dt[6]/a/text()'
                                  ).extract()[0].split()[-1]
        except Exception as e:
            item = ''

        return item

    def get_thumb(self, response):
        try:
            item = response.xpath(
                '/html/body/div[5]/div[1]/div[1]/img/@src').extract()[-1]
        except Exception as e:
            item = ''
        return item

    def get_pinyin(self, response):
        try:
            item = response.css(
                'body > div.body_1000 > div.left_leirong > h1').extract()[-1]
        except Exception as e:
            item = ''

        return item

    def get_wuxing(self, response):
        try:
            item = response.xpath(
                '/html/body/div[5]/div[1]/div[1]/dl/dd[3]/text()').extract(
                )[-1].split(':')[1]
        except Exception as e:
            item = ''

        return item

    def get_jiegou(self, response):
        try:
            item = response.xpath(
                '/html/body/div[5]/div[1]/div[1]/dl/dd[5]/text()').extract(
                )[-1].split(':')[1]
        except Exception as e:
            item = ''

        return item

    def get_bushou(self, response):
        try:
            item = response.xpath(
                '/html/body/div[5]/div[1]/div[1]/dl/dd[1]/text()').extract(
                )[-1].split(':')[1]
        except Exception as e:
            item = ''

        return item

    def get_bihua(self, response):
        try:
            item = response.xpath(
                '/html/body/div[5]/div[1]/div[1]/dl/dd[2]/text()').extract(
                )[-1].split(':')[1]
        except Exception as e:
            item = ''

        return item

    def get_base(self, response):
        try:
            item = response.xpath('//*[@id="jbjs"]').extract()[-1]
        except Exception as e:
            item = ''

        return item

    def get_kangxi(self, response):
        try:
            item = response.css('.zidian_tab a::attr(href)').extract()[2]
        except Exception as e:
            item = ''

        return item

    def get_guhanyu(self, response):
        try:
            item = response.xpath('//*[@id="ghyzd"]/div').extract()[-1]
        except Exception as e:
            item = ''

        return item

    def get_xiangxi(self, response):
        try:
            item = response.xpath('//*[@id="xxjs"]/div').extract()[-1]
        except Exception as e:
            item = ''
        return item

    def get_develop(self, response):
        try:
            item = response.xpath('//*[@id="jbjs"]/dl').extract()[-1]
        except Exception as e:
            item = ''

        return item
Example #6
class BbcspiderSpider(CrawlSpider):
    name = 'BBCspider'
    allowed_domains = ['www.bbc.com']
    start_urls = ['http://www.bbc.com/news',
    'https://www.bbc.com/news/stories'
    # 'https://www.bbc.com/news/world',
    # 'https://www.bbc.com/news/world/africa',
    # 'https://www.bbc.com/news/world/australia',
    # 'https://www.bbc.com/news/world/europe',
    # 'https://www.bbc.com/news/world/latin_america',
    # 'https://www.bbc.com/news/world/middle_east',
    # 'https://www.bbc.com/news/world/us_and_canada',
    # 'https://www.bbc.com/news/world/asia',
    # 'https://www.bbc.com/news/world/asia/china',
    # 'https://www.bbc.com/news/world/asia/india',
    # 'https://www.bbc.com/news/uk',

    ]

    rules = [
        Rule(
            LinkExtractor(allow=r'news', deny=denyrule, unique=True),
            callback='parse_item', follow=True    
        ),
        Rule(
            LinkExtractor(allow=r'https://traffic.outbrain.com/network', deny=denyrule, unique=True),
            callback='parse_item', follow=True
        )

    ]
    
    def start_requests(self):
        for url in self.start_urls:
            yield Request(url, callback=self.parse, dont_filter=True)

    def parse_item(self, response):
        item=BbcCrawlerItem()        
        c_type = response.xpath('//meta[@property="og:type"]/@content').extract_first()
        if (c_type == "article"):
            item['headline'] =response.xpath('//meta[@property="og:title"]/@content').extract_first()
            ar_author = response.xpath('//meta[@property="article:author"]/@content').extract_first()
            author = response.xpath('//meta[@name="author"]/@content').extract_first()
            item["author"]=ar_author if ar_author else author
            item["keywords"] = response.xpath('//div/ul[@class="tags-list"]/li[@class="tags-list__tags"]/a/text()').extract()
            # import pdb; pdb.set_trace()
            item["description"] = response.xpath('//meta[@name="description"]/@content').extract_first()
            body_sc = response.xpath("//div[@class='story-body__inner']")
            if len(body_sc) > 0:
                list_text=body_sc.xpath(delete_cotent).xpath("string(.)").extract()
                text = '\n'.join(list_text).strip()
            else:
                body_sc = response.xpath("//div[contains(@class,'main_article_text')]")
                list_text=body_sc.xpath(delete_cotent).xpath("string(.)").extract()
                text='\n'.join(list_text).strip()
            item['text']=text
            
            item["viewtime"] = datetime.utcnow()
            item["url"] = response.url
            sha1.update(item['text'].encode('utf-8'))
            item["sha1"]=str(sha1.hexdigest())
            # text.replace('\n', '') 
            
            yield item
Example #7
class ShudCrawler(scrapy.Spider):
    name = "amazon"
    config = configparser.ConfigParser()
    config.read('../shud.ini')
    
    sparkSession = SparkSession \
            .builder \
            .appName(config.get('spark', 'appname')) \
            .config("spark.some.config.option", "some-value") \
            .getOrCreate()
            
    sqlContext = SQLContext(sparkSession)

    # Spider name
    # self.config.get('crawling', 'spidername')

    # The domains that are allowed (links to other domains are skipped)
    allowed_domains = [config.get('crawling', 'allowedDomain')]
    # The URLs to start with
    start_urls = [config.get('crawling', 'startUrl')]
    
    # This spider has one rule: extract all (unique and canonicalized) links, follow them and parse them using the parse_items method
    rules = [
        Rule(
            LinkExtractor(canonicalize=False,
                unique=True
            ),
            follow=True,
            callback="parse_items"
        )
    ]    
        
    def start_requests(self):
        print("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^")
        #TODO
        # Walk the list of start URLs to crawl and keep it in memory with crawled=false
        initUrlList = [(self.config.get('crawling', 'startUrl'), "false")]
        df = self.sparkSession.createDataFrame(initUrlList, schema=["url", "crawled"])
        self.sqlContext.registerDataFrameAsTable(df, "WorkTable")
        

        
        indx = 0
        urlListe = self.sqlContext.sql("SELECT url from WorkTable where crawled = 'false'")
        
        while len(urlListe.rdd.collect()) > 0:
            print("####################### Current step = %s " %str(indx))
            
            for url in urlListe.rdd.collect():
                print("************************** Current url = %s " %str(url))                
                #TODO
                # Check that the URL contains at least one of the allowed domains
               # if self.config.get('crawling', 'allowedDomain') in url[0]:
                    
                a=url[0]
                try:
                    self.parse2(a, indx)
                except:
                    pass
                #yield scrapy.Request(url=str(a), callback=self.parse)
                    
            print("************************** Current url = %s " %str(url))
            urlListe = self.sqlContext.sql("SELECT url from WorkTable where crawled = 'false'")                    
            
            print("************************** DEBUTs WorkTable " )
            Myliste = self.sqlContext.sql("SELECT * from WorkTable")
            print(Myliste.show()) 
            print("************************** FIN WorkTable")
            
            if indx > 4:
                break
                
            indx += 1
            
            

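    # parse() collects in-domain links, merges them into the Spark "WorkTable"
    # frontier (marking the current URL as crawled), saves the raw body to an
    # md5-named HTML file and yields a ShudScraperItem for the page.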
    def parse(self, response):        
        print("%%%%%%% Current url = %s " %response.url)

        newUrls = []
        items = []
        # Only extract canonicalized and unique links (with respect to the current page)
        links = LinkExtractor(canonicalize=False, unique=True).extract_links(response)
        # Now go through all the found links
        for link in links:            
            # Check whether the domain of the URL of the link is allowed; so whether it is in one of the allowed domains
            is_allowed = False
            for allowed_domain in self.allowed_domains:
                if allowed_domain in link.url:
                    is_allowed = True
            # If it is allowed, append the url to the list
            if is_allowed:
                newUrls.append((link.url, "false")) 
            #Get all urls to synchronize and update
            df = self.sqlContext\
                .sql("SELECT url, crawled from WorkTable where url <>'%s'" % response.url)\
                .union(self.sparkSession.createDataFrame(newUrls))\
                .union(self.sparkSession.createDataFrame([(response.url, "true")]))\
                .dropDuplicates(['url'])
        
        self.sqlContext.dropTempTable("WorkTable")
        self.sqlContext.registerDataFrameAsTable(df, "WorkTable")
        
        #print(df.show())
        # TODO
        #Put response body's content into RDDs
        #page = response.url.split("/")[-2]
        #page = response.url
        m=hashlib.md5(bytes(str(response.url),"ascii"))   # python 3                
        filename = str(self.name)+'_'+ m.hexdigest() + '.html'
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)
        item = ShudScraperItem()
        item['url_from'] = response.url
        items.append(item)
        yield item
Example #8
class BusinessInsiderSpider(CommonBaseSpider):
    name = "businessinsider"
    base_dir = "crawl"
    allowed_domains = ["businessinsider.in"]
    urls = [
        filter(None, item['subcategory'].values()) for item in categories
        if filter(None, item['subcategory'].values())
    ]
    urls = sum(
        sum(urls, []), []
    )  ## i.e. similar to [item for sublist in urls for subsublist in sublist for item in subsublist]
    start_urls = urls

    rules = (Rule(LinkExtractor(
        allow=(r'http\:\/\/www\.businessinsider\.in\/.+\.cms', )),
                  callback='parse_item',
                  follow=False), )

    def parse_item(self, response):
        super(BusinessInsiderSpider, self).parse_item(response)
        htmlparser = etree.HTMLParser()
        tree = etree.parse(BytesIO(response.body), htmlparser)

        news_item = NewsItem()
        try:

            # title = tree.xpath('//*[@id="Content"]/div[3]/div[3]/div[1]/div/div[1]/div/article/div[1]/h1/text()')
            title = tree.xpath(
                '//*[@id="Content"]/div[3]/div[3]/div[1]/div/div[1]/div/article/div[1]/h1//text()'
            )
            # details = tree.xpath('.//div[contains(@class,\'section1\')]//p//text()')
            details = tree.xpath(
                './/div[contains(@class,"hide_show_handler main_content")]//p//text()'
            )

            if title and details:
                news_item['source'] = self.name
                news_item['source_url'] = response.url.split('?')[0]
                news_item['crawled_date'] = datetime.now()
                news_item['title'] = title[0].strip().encode('ascii', 'ignore')
                news_item['details'] = "\t".join([
                    item.strip().encode('ascii', 'ignore') for item in details
                ])

                img_urls = tree.xpath(
                    './/div[contains(@class,\'MeetingImg blk\')]/img/@src')
                img_url_list = []
                if img_urls:
                    for img_url in img_urls:
                        img_url_list.append("http://www.businessinsider.in" +
                                            img_url)
                    news_item['img_urls'] = get_stripped_list(img_url_list)

                published_date = tree.xpath(
                    './/div[contains(@class,\'ByLine\')]//span[contains(@class,\'Date\')]//text()'
                )
                if published_date:
                    news_item['published_date'] = datetime.strptime(
                        get_stripped_list(published_date)[0],
                        '%b %d, %Y, %I.%M %p')

                author = tree.xpath('.//a[contains(@class,\'Name\')]/text()')
                if author:
                    news_item['author'] = get_stripped_list(author)

                tags = tree.xpath(
                    './/span[contains(@class,\'anchorLink\')]/text()')
                more_tags = tree.xpath(
                    './/div[contains(@id,\'commentHash\')]//a/text()')
                if tags:
                    news_item['tags'] = get_stripped_list(tags)
                if more_tags:
                    news_item['tags'] = get_stripped_list(more_tags)

                cover_image = tree.xpath(
                    './/div[contains(@class,\'MeetingImg blk\')]/img/@src')
                if cover_image:
                    news_item['cover_image'] = img_url_list[0]
                    # get_stripped_list(cover_image)

                referer = response.request.headers['Referer']
                for item in categories:
                    if referer in sum(item['subcategory'].values(), []):
                        news_item['category'] = item['category']
                        key = (key
                               for key, value in item['subcategory'].items()
                               if referer in value).next()
                        news_item['sub_categories'] = [key]
                return news_item

        except Exception as e:
            self.log('==Exception=================>>>>>>>>! %r' % e)
        return None
Example #9
class VansCrawler(CrawlSpider, Mixin):
    name = Mixin.retailer + "-crawler"
    parser = VansParser()
    listings_css = [".topnav-main-item"]
    product_css = [".product-block-figure"]
    deny_re = [".html"]
    PAGE_SIZE = 48

    rules = (Rule(LinkExtractor(restrict_css=listings_css, deny=deny_re),
                  callback="parse_pagination"), )

    def category_zones(self, response):
        css = ".body-container div::attr(lmzone)"
        return response.css(css).extract()

    def site_id(self, response):
        script_re = "WCS_CONFIG.ATTRAQT = (.+?);"
        raw_site_id = json.loads(
            re.findall(script_re,
                       response.body.decode("utf-8").replace("\n", ""))[0])
        return re.findall("zones/(.*).min", raw_site_id["MAINJS"])[0]

    def config_categorytree(self, response):
        return re.findall('categorytree : "(.*)"',
                          response.body.decode("utf-8"))[0]

    def config_language(self, response):
        css = "meta[name='locale']::attr(content)"
        return response.css(css).extract_first()

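    # Build one paginated zone request per result page, using the site's Attraqt
    # config (zones, siteId, language), and parse the returned JSON in
    # parse_raw_content.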
    def parse_pagination(self, response):
        pages = self.page_count(response)
        cat_zones = self.category_zones(response)
        lang = self.config_language(response)

        parameters = {
            "zone0": cat_zones[0],
            "zone1": cat_zones[1],
            "mergehash": "true",
            "config_categorytree": self.config_categorytree(response),
            "siteId": self.site_id(response),
            "config_language": lang,
            "language": lang,
            "config_country": self.market
        }

        for page in range(0, pages + self.PAGE_SIZE, self.PAGE_SIZE):
            parameters[
                "pageurl"] = f"{response.url}#esp_pg={page//self.PAGE_SIZE}"
            url = self.pagniation_req_url_t.format(urlencode(parameters))

            yield Request(url,
                          callback=self.parse_raw_content,
                          dont_filter=True)

    def parse_raw_content(self, response):
        script_re = "LM.buildZone\((.*)\)"
        raw_html = json.loads(
            re.findall(script_re, response.body.decode("utf-8"))[0])
        new_response = response.replace(body=raw_html["html"])

        return [
            Request(url, callback=self.parse_item)
            for url in self.product_urls(new_response)
        ]

    def parse_item(self, response):
        return self.parser.parse_product(response)

    def product_urls(self, response):
        css = ".product-block-pdp-url::attr(href)"
        urls = response.css(css).extract()
        return [f"{self.start_urls[0]}{url}" for url in urls]

    def page_count(self, response):
        css = ".header-result-counter ::text"
        return int(response.css(css).re_first(r"\d+") or '0')
Example #10
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name' : "//div[@class='Box_SP_detail']/div[@class='Box_SP_detail_right']/h2",
    'price' : "//div[@class='Box_SP_detail']/div[@class='Box_SP_detail_right']/p/b/span",
    'category' : "",
    'description' : "//div[@class='Page_left_2']/div[@class='Box_content']/div[@id='content']",
    'images' : "//div[@class='Box_SP_detail_left']/h3/a/@href",
    'canonical' : "",
    'base_url' : "",
    'brand' : ""
}
name = 'daotoan.com'
allowed_domains = ['daotoan.com']
start_urls = ['http://daotoan.com']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(allow=['/[a-zA-Z0-9-_]+-id+\.aspx']), 'parse_item'),
    Rule(LinkExtractor(allow=['/[a-zA-Z0-9-_]+-in+\.aspx']), 'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
Example #11
class ShclearingSpider(CrawlSpider):
    name = 'shclearing'
    source = "上海清算所"
    allowed_domains = ["shclearing.com"]
    yesterday = datetime.date.today() - datetime.timedelta(days=1)
    yesterday = yesterday.strftime('%Y%m%d')
    reg = yesterday
    start_urls = ['http://www.shclearing.com/cpyyw/tzgg/']
    rules = (
        Rule(LinkExtractor(allow=reg), callback="parse_news", follow=True),
        # Rule(LinkExtractor(allow='_[0-9].html'))
    )

    def printcn(self, uni):
        for i in uni:
            print i.encode('utf-8')

    def parse_news(self, response):
        item = GenericItem()
        self.get_id(response, item)
        self.get_url(response, item)
        self.get_source(response, item)
        self.get_title(response, item)
        self.get_date(response, item)
        self.get_body(response, item)
        # !!! remember to return the item after parsing
        if item['body']:
            return item

    def get_id(self, response, item):
        id = uuid.uuid4()
        if id:
            item['id'] = id

    def get_url(self, response, item):
        news_url = response.url
        if news_url:
            item['url'] = news_url

    def get_source(self, response, item):
        source = self.source
        if source:
            item['source'] = source

    def get_title(self, response, item):
        title = response.xpath('//h1[@id="title"]/text()').extract()
        if title:
            item['title'] = ''.join(title).strip()

    def get_date(self, response, item):
        item['date'] = self.yesterday + '000000'

    def get_body(self, response, item):
        paras = response.xpath('//div[@class="TRS_Editor"]/p')
        news_body = ''
        for p in paras:
            data = p.xpath('string(.)').extract()
            if data:
                body = ''
                for line in ''.join(data).splitlines():
                    #   print entry.encode('utf-8')
                    body += line.strip()
                news_body += body + '_|_'
        item['body'] = news_body.replace('_|__|_', '_|_')
Example #12
class X163Spider(CrawlSpider):
    name = '163'
    source = "网易财经"
    allowed_domains = ["163.com"]
    yesterday = datetime.date.today() - datetime.timedelta(days=1)
    yesterday = yesterday.strftime('%Y/%m%d')[2:]
    reg = yesterday
    start_urls = [
        'http://money.163.com/special/00252G50/macro.html',
        'http://money.163.com/special/00252C1E/gjcj.html'
    ]
    rules = (Rule(LinkExtractor(allow=reg), callback="parse_news",
                  follow=True), Rule(LinkExtractor(allow='macro_')),
             Rule(LinkExtractor(allow='gjcj_')))

    def printcn(self, uni):
        for i in uni:
            print i.encode('utf-8')

    def parse_news(self, response):
        item = GenericItem()
        self.get_id(response, item)
        self.get_url(response, item)
        self.get_source(response, item)
        self.get_title(response, item)
        self.get_date(response, item)
        self.get_body(response, item)
        # !!! remember to return the item after parsing
        if item['body']:
            return item

    def get_id(self, response, item):
        id = uuid.uuid4()
        if id:
            item['id'] = id

    def get_url(self, response, item):
        news_url = response.url
        if news_url:
            item['url'] = news_url

    def get_source(self, response, item):
        source = self.source
        if source:
            item['source'] = source

    def get_title(self, response, item):
        title = response.xpath(
            '//div[@id="epContentLeft"]/h1/text()').extract()
        if title:
            item['title'] = title

    def get_date(self, response, item):
        date = response.xpath(
            '//div[@class="post_time_source"]/text()').extract()[0]
        if date:
            item['date'] = ''.join(date).replace(u'-', u'').replace(
                u':', u'').replace(u' ', u'').strip()[:14]

    def get_body(self, response, item):
        paras = response.xpath('//div[@id="endText"]/p')
        news_body = ''
        for p in paras:
            data = p.xpath('string(.)').extract()
            if data:
                body = ''
                for line in ''.join(data).splitlines():
                    #   print entry.encode('utf-8')
                    body += line.strip()
                news_body += body + '_|_'
        item['body'] = news_body.replace('_|__|_', '_|_')
Example #13
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name': "//div[2]/h1/a",
    'price':
    "//h3[@class='views-field views-field-display-price']/span[@class='field-content']",
    'category': "//div[@class='breadcrumb']/a",
    'description':
    "//div[@class='views-field views-field-body']/div[@class='field-content']",
    'images': "//div[@class='field-item even']/a/img/@src",
    'canonical': "",
    'base_url': "",
    'brand': ""
}
name = 'khoe24.vn'
allowed_domains = ['khoe24.vn']
start_urls = ['http://khoe24.vn/']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(allow=['/thuoc/']), 'parse_item'),
    Rule(LinkExtractor(deny=['/']), 'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
Example #14
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name' : "//div[@class='product-name']/h1",
    'price' : "//div[@class='product-quick-view']/div[@class='left-info']/div[@class='price-box']/span/span",
    'category' : "//div[@class='breadcrumbs']/ul/li/a",
    'description' : "//div[@class='box-collateral box-description']",
    'images' : "//img[@id='zoom_03']/@data-zoom-image",
    'canonical' : "",
    'base_url' : "",
    'brand' : ""
}
name = 'kiddymart.vn'
allowed_domains = ['kiddymart.vn']
start_urls = ['http://kiddymart.vn']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(), 'parse_item'),
    Rule(LinkExtractor(), 'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
Example #15
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name': "//div[@class='prod_content']/h1[@id='prod_title']",
    'price': "//div[@class='final_price']/span[@id='special_price_box']",
    'category': "//div[@class='bcr box breadcrumbs']/ul/li/a",
    'description': "//div[@class='prod_details']",
    'images': "//div[@id='prdMedia']/div[@id='img_large']/div/a/@href",
    'canonical': "",
    'base_url': "",
    'brand': ""
}
name = 'anhchinh.vn'
allowed_domains = ['anhchinh.vn']
start_urls = ['http://www.anhchinh.vn']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(allow=['/[a-zA-Z0-9-_]+_id+\d+\.html']), 'parse_item'),
    Rule(LinkExtractor(allow=['/[a-zA-Z0-9-_]+_dm+\d+\.html'], deny=['\?']),
         'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
Example #16
class UrbanDictSpider(CrawlSpider):
    name = "urbandict"
    allowed_domains = ["urbandictionary.com"]
    start_urls = [
        "http://www.urbandictionary.com/",
        "http://www.urbandictionary.com/popular.php?character=Q",
        "http://www.urbandictionary.com/popular.php?character=W",
        "http://www.urbandictionary.com/popular.php?character=E",
        "http://www.urbandictionary.com/popular.php?character=R",
        "http://www.urbandictionary.com/popular.php?character=T",
        "http://www.urbandictionary.com/popular.php?character=Y",
        "http://www.urbandictionary.com/popular.php?character=U",
        "http://www.urbandictionary.com/popular.php?character=I",
        "http://www.urbandictionary.com/popular.php?character=O",
        "http://www.urbandictionary.com/popular.php?character=P",
        "http://www.urbandictionary.com/popular.php?character=A",
        "http://www.urbandictionary.com/popular.php?character=S",
        "http://www.urbandictionary.com/popular.php?character=D",
        "http://www.urbandictionary.com/popular.php?character=F",
        "http://www.urbandictionary.com/popular.php?character=G",
        "http://www.urbandictionary.com/popular.php?character=H",
        "http://www.urbandictionary.com/popular.php?character=J",
        "http://www.urbandictionary.com/popular.php?character=K",
        "http://www.urbandictionary.com/popular.php?character=L",
        "http://www.urbandictionary.com/popular.php?character=Z",
        "http://www.urbandictionary.com/popular.php?character=X",
        "http://www.urbandictionary.com/popular.php?character=C",
        "http://www.urbandictionary.com/popular.php?character=V",
        "http://www.urbandictionary.com/popular.php?character=B",
        "http://www.urbandictionary.com/popular.php?character=N",
        "http://www.urbandictionary.com/popular.php?character=M",
        "http://www.urbandictionary.com/yesterday.php",
    ]

    rules = (Rule(LinkExtractor(allow=("browse", "popular", "yesterday",
                                       "define", "favorites")),
                  callback="parse_items",
                  follow=True), )

    def parse_items(self, response):
        """
        
        """
        sel = Selector(response)
        items = []
        """if "urbandictionary.com/define" not in response.url:
            return items"""

        sites = sel.xpath('//div[@id="content"]/div[@class="def-panel"][1]')
        for site in sites:
            item = DomainNameScraperItem()
            item['word'] = site.xpath(
                'div[@class="def-header"]/a[@class="word"]/text()').extract()
            item['meaning'] = site.xpath(
                'div[@class="meaning"]/text()').extract()
            item['example'] = site.xpath(
                'div[@class="example"]/text()').extract()
            item['pos'] = site.xpath(
                './/a[@class="thumb up"]/span[@class="count"]/text()').extract(
                )
            item['neg'] = site.xpath(
                './/a[@class="thumb down"]/span[@class="count"]/text()'
            ).extract()
            item["source"] = "urbandictonary"
            items.append(item)

        return items
Example #17
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name': "//div/div/div/h1",
    'price': "//tr/td/font",
    'category': "//div[@id='yarnball']/ul[@class='yarnball']/li/a",
    'description':
    "//div[@id='centerid']/div[@id='content']/div[@id='contentid']/div[@class='viewpron']",
    'images': "//td[@id='proimg']/img/@src",
    'canonical': "",
    'base_url': "",
    'brand': ""
}
name = 'alobaby.com.vn'
allowed_domains = ['alobaby.com.vn']
start_urls = ['http://alobaby.com.vn']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(allow=['/\d+/\d+/']), 'parse_item'),
    Rule(LinkExtractor(allow=['/[a-zA-Z0-9-]+-\d+\.html']), 'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
Example #18
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name': "//h1",
    'price': "//div[@class='product-price-pomotion']/p[@class='price']",
    'category':
    "//div[@class='product-category-path']/ul[@class='list-item']/li/a",
    'description': "//div[@class='pr-l-snv-i active']",
    'images': "//div[@id='gallery']/a/@href",
    'canonical': "",
    'base_url': "",
    'brand': "",
    'in_stock': "",
    'guarantee': ""
}
name = 'haiphongtelecom.com'
allowed_domains = ['haiphongtelecom.com']
start_urls = ['http://haiphongtelecom.com/']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = ['']
rules = [
    #Rule(LinkExtractor(), 'parse_item'),
    #Rule(LinkExtractor(), 'parse'),
    Rule(
        LinkExtractor(allow=[
            '/[a-zA-Z0-9-]+\.html($|\?page=\d+&sort=goods_id&order=DESC#goods_list$)'
        ]), 'parse_item_and_links'),
]
Example #19
class IndependentSpider(CrawlSpider):
    name = "independent"
    allowed_domains = ["independent.co.uk"]

    def __init__(self, yearmonth='', *args, **kwargs):
        super(IndependentSpider, self).__init__(*args, **kwargs)
        begin_date = pd.Timestamp(yearmonth + "-01")
        end_date = pd.Timestamp(begin_date) + pd.DateOffset(
            months=1) - pd.DateOffset(days=1)
        date_inds = [
            d.date().isoformat() for d in pd.date_range(begin_date, end_date)
        ]
        self.start_urls = [
            "http://www.independent.co.uk/archive/%s" % d for d in date_inds
        ]

    rules = (Rule(LinkExtractor(
        allow=(),
        restrict_xpaths=('//ol[@class="margin archive-news-list"]/li/a', )),
                  callback="parse_items",
                  follow=True), )

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        item = NewsItem()
        item["link"] = response.request.url
        item["lang"] = "en"
        item["source"] = "independent"

        title = hxs.xpath('//h1[@itemprop="headline"]/text()').extract()
        intro = hxs.xpath('//div[@class="intro"]/p/text()').extract()
        author = hxs.xpath('//span[@itemprop="name"]/a/text()').extract()
        category = hxs.xpath(
            '//ol[@class="breadcrumbs clearfix"]//a//text()').extract()
        new_content = hxs.xpath(
            '//div[@itemprop="articleBody"]/p//text()').extract()
        date_time = hxs.xpath(
            '//ul[@class="caption meta inline-pipes-list"]//time/@datetime'
        ).extract()
        #
        # Processing outputs
        author = [re.sub('^By\s', '', a) for a in author]
        author = [re.sub('\sin\s.*', '', a) for a in author]
        new_content = [p for p in new_content if not re.search(u'\u2022', p)]
        new_content = [
            p for p in new_content
            if not re.search('font-family|background-color:', p)
        ]
        new_content = ' '.join(new_content)
        new_content = re.sub('\n', '', new_content)
        item["content"] = re.sub('\s{2,}', ' ', new_content)
        author = '|'.join(author)
        item["category"] = '|'.join(category)
        item["intro"] = ' '.join(intro)
        item["title"] = ' '.join(title)
        datte = re.findall('[0-9]+.[0-9]+.[0-9]+', date_time[0])[0]
        tme = re.findall('[0-9]+:[0-9]+', date_time[0])[0]
        datte = datte.split('/')
        item["date_time"] = datte[2] + '-' + datte[1] + '-' + datte[
            0] + 'T' + tme
        item["author"] = author

        return (item)
Example #20
class MovieSpider(CrawlSpider):
    name = 'movie'
    allowed_domains = ['www.id97.com']
    start_urls = ['http://www.id97.com/movie/']

    # Customize some settings options for this spider
    # custom_settings = {
    #     'ITEM_PIPELINES' : {
    #         'demo1.pipelines.MyMongoDbPipeline': 302,
    #     }
    # }
    # Extract all pagination links according to the rule
    page_link = LinkExtractor(allow=r'/movie/\?page=\d')
    detail_link = LinkExtractor(
        restrict_xpaths='//div[contains(@class,"col-xs-1-5")]/div/a')
    # follow: whether to keep following links from matched pages
    rules = (
        # Pagination pages need no parsing; just follow them
        Rule(page_link, follow=True),
        # Detail pages are parsed but not followed
        Rule(detail_link, callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        # Build the item for this detail page
        item = Demo1Item()
        # Movie poster
        item['post'] = response.xpath(
            '//a[@class="movie-post"]/img/@src').extract_first()
        # Movie title
        item['name'] = response.xpath('//h1').xpath(
            'string(.)').extract_first()
        # Movie rating
        item['score'] = response.xpath(
            '//div[@class="col-xs-8"]/table/tbody/tr[last()]/td[2]').xpath(
                'string(.)').extract_first()
        # Movie genre
        item['_type'] = response.xpath(
            '//div[@class="col-xs-8"]/table/tbody/tr[3]/td[2]').xpath(
                'string(.)').extract_first()
        # Director
        item['director'] = response.xpath(
            '//div[@class="col-xs-8"]/table/tbody/tr[1]/td[2]/a/text()'
        ).extract_first()
        # Screenwriter
        item['editor'] = response.xpath(
            '//div[@class="col-xs-8"]/table/tbody/tr[2]/td[2]/a/text()'
        ).extract_first()
        # Lead actors
        # '张静初 / 龙品旭 / 黎兆丰 / 王同辉 / 张国强 / 叶婉娴 / 丽娜 / 吴海燕 / 吴若林 / 喻引娣 显示全部'
        item['actor'] = response.xpath(
            '//div[@class="col-xs-8"]/table/tbody/tr[3]/td[2]').xpath(
                'string(.)').extract_first().replace(' ',
                                                     '').replace('显示全部', '')
        # Runtime
        lala = response.xpath(
            '//div[@class="col-xs-8"]/table/tbody/tr[8]/td[2]/text()'
        ).extract_first()
        if lala and ('分钟' in lala):
            item['long_time'] = lala
        else:
            item['long_time'] = ''
        # Movie synopsis
        introduce = response.xpath('//div[@class="col-xs-12 movie-introduce"]'
                                   ).xpath('string(.)').extract_first()
        if introduce is None:
            item['introduce'] = ''
        else:
            item['introduce'] = introduce.replace('\u3000',
                                                  '').replace('展开全部', '')
        # Movie download link
        # item['download_url'] = response.xpath('')
        yield item
Example #21
class Iosrpgcrawler(CrawlSpider):

    name = 'iosrpgcrawler'
    allowed_domains = ['apps.apple.com']
    start_urls = [
        'https://apps.apple.com/us/genre/ios-games-role-playing/id7014?letter=A'
    ]

    rules = (
        # paginate by letter
        Rule(
            LinkExtractor(
                allow='genre/ios-games-role-playing/id7014\?letter=(\D)')),
        # paginate to next page
        Rule(
            LinkExtractor(
                allow=
                'genre/ios-games-role-playing/id7014\?letter=(\D)&page=(\d+)#page',
                restrict_xpaths='//a[@class="paginate-more"]')),
        # go to the actual app description page
        Rule(LinkExtractor(allow='app\/(.+)\/id(\d+)',
                           restrict_xpaths='//div[@id="selectedcontent"]'),
             callback='parse_game'),
    )

    def parse_game(self, response):

        il = GameItemLoader(item=Game(), response=response)

        # basic information
        il.add_xpath(
            'title',
            '//h1[@class="product-header__title app-header__title"]/text()')
        il.add_xpath(
            'subtitle',
            '//h2[@class="product-header__subtitle app-header__subtitle"]/text()'
        )
        il.add_xpath(
            'author',
            '//h2[@class="product-header__identity app-header__identity"]/a/text()'
        )
        il.add_xpath(
            'price',
            '//li[@class="inline-list__item inline-list__item--bulleted app-header__list__item--price"]/text()'
        )
        il.add_xpath(
            'iap',
            '//li[@class="inline-list__item inline-list__item--bulleted app-header__list__item--in-app-purchase"]/text()'
        )
        il.add_xpath('age',
                     '//span[@class="badge badge--product-title"]/text()')
        il.add_xpath('desc', '//div[@class="section__description"]//p/text()')

        # game popularity and reception
        il.add_xpath('list_rank', '//li[@class="inline-list__item"]/text()')
        il.add_xpath(
            'score',
            '//span[@class="we-customer-ratings__averages__display"]/text()')
        il.add_xpath(
            'nrating',
            '//div[@class="we-customer-ratings__count small-hide medium-show"]/text()'
        )
        il.add_xpath('stars',
                     '//div[@class="we-star-bar-graph__row"]/div/div/@style')

        # other details
        il.add_xpath(
            'editor',
            '//div[@class="we-editor-notes lockup ember-view"]/div/h3/text()')
        il.add_xpath(
            'seller',
            '//dl[@class="information-list information-list--app medium-columns"]/div[1]/dd[@class="information-list__item__definition l-column medium-9 large-6"]/text()'
        )
        il.add_xpath(
            'size',
            '//dl[@class="information-list information-list--app medium-columns"]/div[2]/dd[@class="information-list__item__definition l-column medium-9 large-6"]/text()'
        )
        il.add_xpath(
            'category',
            '//dl[@class="information-list information-list--app medium-columns"]/div[3]/dd/a/text()'
        )
        il.add_xpath(
            'compat',
            '//dl[@class="information-list information-list--app medium-columns"]//p/text()'
        )
        il.add_xpath(
            'lang',
            '//dl[@class="information-list information-list--app medium-columns"]//p/text()'
        )
        il.add_xpath(
            'age_copy',
            '//dl[@class="information-list information-list--app medium-columns"]/div//dd/text()'
        )
        il.add_xpath(
            'support',
            '//div[@class="supports-list__item__copy"]/h3[@dir="ltr"]/text()')

        return il.load_item()
Example #22
class BaiduSpider(RedisCrawlSpider):
    task_queue = baidu_task_queue
    base_url = "https://baike.baidu.com"
    name = baidu_spider_name
    allowed_domains = ['baike.baidu.com']
    rules = (Rule(LinkExtractor(allow=('https://baike.baidu.com/item/', )),
                  callback='parse',
                  follow=True), )

    # custom_settings = {
    #     'ITEM_PIPELINES': {
    #         'baikeSpider.pipelines.SpiderPipeline': 300,
    #         'baikeSpider.pipelines.SpiderRedisPipeline': 301,
    #         'baikeSpider.pipelines.WebCachePipeline': 302,
    #     },
    # }

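    # parse(): store the raw page plus the title / summary / basic info / catalog /
    # description / reference and tag fields, and collect in-page item links for
    # further crawling.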
    def parse(self, response):
        items = BaiduSpiderItem()
        selector = Selector(response)
        # print(response.status, response)
        items['url'] = unquote(response.url)
        items['html'] = response.text
        title = selector.xpath("/html/head/title/text()").extract()
        if title:
            items['title'] = title[0].strip().encode(
                'utf-8', errors='ignore').decode('utf-8')
        else:
            items['title'] = ''
        summary = selector.xpath("//div[@class=\"lemma-summary\"]").xpath(
            "string(.)").extract()
        if summary:
            tmps = summary[0].encode('utf-8', errors='ignore').decode('utf-8')
            items['summary'] = re.sub('(\r\n){2,}|\n{2,}|\r{2,}', '\n', tmps)
        else:
            items['summary'] = ''

        basic_info = selector.xpath("//div[@class=\"basic-info cmn-clearfix\"]"
                                    ).xpath("string(.)").extract()
        if basic_info:
            tmpb = basic_info[0].encode('utf-8',
                                        errors='ignore').decode('utf-8')
            items['basic_info'] = re.sub('(\r\n){2,}|\n{2,}|\r{2,}', '\n',
                                         tmpb)
        else:
            items['basic_info'] = ''

        catalog = selector.xpath("//div[@class=\"lemmaWgt-lemmaCatalog\"]"
                                 ).xpath("string(.)").extract()
        if catalog:
            tmpc = catalog[0].encode('utf-8', errors='ignore').decode('utf-8')
            items['catalog'] = re.sub('(\r\n){2,}|\n{2,}|\r{2,}', '\n', tmpc)
        else:
            items['catalog'] = ''

        # Item links to follow for iterative crawling
        urls = [
            unquote(item) for item in selector.xpath(
                "//div[@class=\"para\"]//a[@target=\"_blank\"]/@href").extract(
                )
        ]
        items['keywords_url'] = list(set(filter(lambda x: 'item' in x, urls)))

        description = selector.xpath(
            "//div[@class=\"content-wrapper\"]").xpath("string(.)").extract()
        if description:
            tmpd = description[0].encode('utf-8',
                                         errors='ignore').decode('utf-8')
            items['description'] = re.sub('(\r\n){2,}|\n{2,}|\r{2,}', '\n',
                                          tmpd)
        else:
            items['description'] = ''

        # Extract embedded images, JS and CSS from the cached HTML
        items['embed_image_url'] = CacheTool.parse_img(items['html'])
        items['js'] = CacheTool.parse_js(items['html'])
        items['css'] = CacheTool.parse_css(items['html'])

        album_pic_url = selector.xpath(
            "//div[@class=\"album-list\"]//a[@class=\"more-link\"]/@href"
        ).extract()
        if album_pic_url:
            items['album_pic_url'] = self.base_url + unquote(album_pic_url[0])
        else:
            items['album_pic_url'] = ''

        update_time = selector.xpath(
            "//span[@class = 'j-modified-time']").xpath("string(.)").extract()
        if update_time:
            tmpu = update_time[0].strip().encode(
                'utf-8', errors='ignore').decode('utf-8')
            items['update_time'] = re.sub('(\r\n){2,}|\n{2,}|\r{2,}', '\n',
                                          tmpu)
        else:
            items['update_time'] = ''

        reference_material = selector.xpath(
            "//dl[@class ='lemma-reference collapse nslog-area log-set-param']"
        ).xpath("string(.)").extract()
        if reference_material:
            tmpr = reference_material[0].encode(
                'utf-8', errors='ignore').decode('utf-8')
            items['reference_material'] = re.sub('(\r\n){2,}|\n{2,}|\r{2,}',
                                                 '\n', tmpr)
            # print(items['reference_material'])
        else:
            items['reference_material'] = ''

        item_tag = selector.xpath("//dd[@id = \"open-tag-item\"]").xpath(
            "string(.)").extract()
        if item_tag:
            tmpi = item_tag[0].encode('utf-8', errors='ignore').decode('utf-8')
            items['item_tag'] = re.sub('(\r\n){2,}|\n{2,}|\r{2,}', '\n', tmpi)
        else:
            items['item_tag'] = ''
        print('百度百科爬虫==》', items['title'])
        yield copy.deepcopy(
            items)  # deep copy: a shared (shallowly copied) item can get its fields mixed up later in the pipelines
Example #23
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name':
    "//div[@class='summary entry-summary']/h1[@class='product_title entry-title']",
    'price': "//p[@class='price']/span[@class='amount']",
    'category': "//div[@class='breadcrumb-trail']/a",
    'description':
    "//div[@class='woocommerce-tabs']/div[@id='tab-description']/div/div/p",
    'images': "//img[@class='attachment-shop_single wp-post-image']/@src",
    'canonical': "",
    'base_url': "",
    'brand': ""
}
name = 'phukieniphone.info'
allowed_domains = ['phukieniphone.info']
start_urls = ['http://phukieniphone.info/']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(allow=['/9/[a-z0-9-]']), 'parse_item'),
    Rule(LinkExtractor(deny=['/9/[a-z0-9-]', 'add_to_cart']), 'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
Example #24
class ShopindexSpider(CrawlSpider):
    name = 'shopindex'
    allowed_domains = ['nuomi.com', 'dianping.com', 'cq.meituan.com']
    start_urls = [
        # 'https://www.dianping.com/shop/24098260'
        # 'http://cq.meituan.com/shop/82458075'
        # ,'http://www.nuomi.com/deal/d3ccslof.html'
        # ,'https://www.dianping.com/shop/32463358'
    ]

    # settings = get_project_settings()

    downLoadUrlsFile = '../hlwdata/data/downloaded_url.txt'
    startUrlsFile = '../hlwdata/data/start_url.txt'
    lst = loadUrl(downLoadUrlsFile)

    rules = (
        Rule(FilterLinkExtractor(allow=r'http://www.nuomi.com/deal/[\w]+',
                                 download=lst),
             callback='parse_nuomi',
             follow=True),
        Rule(FilterLinkExtractor(allow=r'https://www.dianping.com/shop/[\d]+$',
                                 download=lst),
             callback='parse_dianping',
             follow=True),
        Rule(FilterLinkExtractor(
            allow=r'http://cq.meituan.com/shop/[\d]+\.*[\w]*$', download=lst),
             callback='parse_meituan',
             process_links='link_filtering',
             follow=True),
    )

    def link_filtering(self, links):
        for link in links:
            link.url = link.url.rstrip('.html')
        return links

    visitedShop = set()

    def start_requests(self):

        for url in loadUrl(self.startUrlsFile):
            yield self.make_requests_from_url(url)
        for url in self.start_urls:
            yield self.make_requests_from_url(url)

    def parse_nuomi(self, response):

        # Only crawl food ('美食') listings
        prdType = response.xpath(
            '//div[@class="w-bread-crumb"]//a[@href="/326"]/text()').extract()
        prdType = "".join(prdType).strip('\n')
        if prdType != u'美食':
            return

        items = []
        sel = response.xpath('//div[@class="p-item-info"]')
        dealId = sel.xpath('@mon').extract_first().split('=')[1]
        shopUrl = 'http://www.nuomi.com/pcindex/main/shopchain?dealId=' + dealId
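        # The shopchain endpoint returns this deal's shops as JSON; fetch it
        # with a plain requests call rather than through Scrapy's downloader.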

        html = requests.get(shopUrl, headers=headers)

        js = json.loads(html.text)

        # shopCity = js['data']['city']['900010000']['city_name']

        for shop in js['data']['shop']:

            shopId = shop['merchant_id']
            shopCity = shop['city_id']
            # Only collect food listings from Chongqing
            # if shopId in self.visitedShop or shopCity != u'900010000':
            if shopId in self.visitedShop:
                continue
            else:
                self.visitedShop.add(shopId)
            shopName = shop['name']
            shopCity = js['data']['city'][shopCity]['city_name']
            shopAddr = shop['address']
            shopPhone = shop['phone']
            shopGlat = shop['baidu_latitude']
            shopGlng = shop['baidu_longitude']
            shopUrl = shop['link']
            shopPicSave = ''
            shopScrapWeb = 'nuomi'

            item = ShopIndexItem()
            item['shopId'] = shopId
            item['shopCity'] = shopCity
            item['shopName'] = shopName
            item['shopAddr'] = shopAddr
            item['shopPhone'] = shopPhone
            item['shopGlat'] = shopGlat
            item['shopGlng'] = shopGlng
            item['shopUrl'] = shopUrl
            item['shopPicSave'] = shopPicSave
            item['shopScrapWeb'] = shopScrapWeb

            items.append(item)
        return items

    def parse_dianping(self, response):
        sel = response.xpath('//div[@id="basic-info"]')

        # Only crawl food listings; the #basic-info block above marks a food page

        if not sel:
            print('not meishi ' + response.url)
            return

        shopId = re.search(r'/shop/([\d]+)$', response.url).group(1)

        if shopId in self.visitedShop:
            return
        else:
            self.visitedShop.add(shopId)

        shopCity = response.xpath(
            '//*[@id="page-header"]//a[@class="city J-city"]/text()'
        ).extract_first()
        shopName = sel.xpath('h1[@class="shop-name"]/text()').extract_first()
        shopAddr = sel.xpath(
            './/span[@itemprop="street-address"]/text()').extract_first()
        shopPhone = sel.xpath(
            './/span[@itemprop="tel"]/text()').extract_first()

        # shopDataUrl = 'http://www.dianping.com/ajax/json/shop/wizard/BasicHideInfoAjaxFP?shopId=%s'%shopId
        # htmlshop = requests.get(shopDataUrl, headers= headers)
        # try:
        #     shopJson = json.loads(htmlshop.text)
        #     shopInfo = shopJson['msg']['shopInfo']
        #     shopGlat = str(shopInfo['glat'])
        #     shopGlng = str(shopInfo['glng'])
        #
        # except (ValueError, KeyError, TypeError):
        #     print "JSON format error"
        shopInfo = ''
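        # Fall back to the lng:/lat: coordinates embedded in the page's inline script.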
        lng = re.search(r'lng:([\d]+\.[\d]+)', response.text)
        lat = re.search(r'lat:([\d]+\.[\d]+)', response.text)
        shopGlat = ''
        shopGlng = ''
        if lng and lat:
            shopGlng = lng.group(1)
            shopGlat = lat.group(1)

        shopUrl = response.url
        shopPicSave = ''
        shopScrapWeb = 'dianping'

        item = ShopIndexItem()
        item['shopId'] = shopId
        item['shopCity'] = shopCity
        item['shopName'] = shopName.strip()
        item['shopAddr'] = shopAddr.strip()
        item['shopPhone'] = shopPhone
        item['shopGlat'] = shopGlat
        item['shopGlng'] = shopGlng
        item['shopUrl'] = shopUrl
        item['shopPicSave'] = shopPicSave
        item['shopScrapWeb'] = shopScrapWeb

        yield item

    def parse_meituan(self, response):
        sel = response.xpath('//div[@class="fs-section__left"]')

        # if not response.xpath('//div[@id="meishi-menu"]/h2[@class="content-title"]'):
        #     print 'not meishi ' + response.url
        #     return

        shopId = re.search(r'/shop/([\d]+)$', response.url).group(1)
        if shopId in self.visitedShop:
            return
        else:
            self.visitedShop.add(shopId)

        shopName = sel.xpath(
            './/h2/span[@class="title"]/text()').extract_first()
        shopAddr = sel.xpath('.//p/span[@class="geo"]/text()').extract_first()

        shopJson = json.loads(
            sel.xpath(
                './/p/span[@id="map-canvas"]/@data-params').extract_first())
        shopInfo = shopJson['shops'][shopId]
        shopPhone = shopInfo['phone']
        shopGlat = str(shopInfo['position'][0])
        shopGlng = str(shopInfo['position'][1])

        shopUrl = response.url
        shopPicSave = ''
        shopScrapWeb = 'meituan'

        item = ShopIndexItem()
        item['shopId'] = shopId
        item['shopCity'] = ''
        item['shopName'] = shopName.strip()
        item['shopAddr'] = shopAddr.strip()
        item['shopPhone'] = shopPhone
        item['shopGlat'] = shopGlat
        item['shopGlng'] = shopGlng
        item['shopUrl'] = shopUrl
        item['shopPicSave'] = shopPicSave
        item['shopScrapWeb'] = shopScrapWeb

        yield item
Example #25
0
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name' : "//div[@class='content_data']/div[@class='content_trangcon']/h1",
    'price' : "//div[@class='noidung_sanpham']/div[@id='sanpham']/div[@class='sanpham_mota']/div[@class='giaban_ct']",
    'category' : "",
    'description' : "//div[@class='content_data']/div[@class='content_trangcon']/div[@class='noidung_sanpham']/div",
    'images' : "//div[@class='img_noidung']/div[@id='load_IMG']/img/@src",
    'canonical' : "",
    'base_url' : "",
    'brand' : ""
}
name = 'luxy.vn'
allowed_domains = ['luxy.vn']
start_urls = ['http://luxy.vn']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(allow=['/[a-zA-Z0-9-]+\.html']), 'parse_item'),
    Rule(LinkExtractor(allow=['/[a-zA-Z0-9-]+/+$']), 'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
Example #26
0
import config.config as config, config.database as db
import psycopg2  # used below to open the PostgreSQL connection
import mySpider  # local module defining wscrappingSpider (referenced below)
from scrapy.crawler import CrawlerProcess
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

process = CrawlerProcess({
    'USER_AGENT': config.Config.USER_AGENT,
    'DOWNLOAD_DELAY': 2,
    'RETRY_ENABLED': False,
    'COOKIES_ENABLED': False,
    'REDIRECT_ENABLED': False,
    'AJAXCRAWL_ENABLED': True
})

conn_string = "host='" + db.DatabaseConfig.DB_HOST + "' "
conn_string += "dbname='" + db.DatabaseConfig.DB_NAME + "' "
conn_string += "user='******'"
conn_string += "password='******'"
conn = psycopg2.connect(conn_string)

rules = (Rule(LinkExtractor(), callback='parse_item', follow=True), )

process.crawl(mySpider.wscrappingSpider,
              db=conn,
              allowed_domains=config.Config.DOMAINS_TO_SCRAP,
              keywords=config.Config.KEYWORDS_FILTER,
              start_urls=config.Config.URLS_TO_SCRAP,
              rules=rules)
process.start()

conn.close()
Example #27
0
class LagouSpider(CrawlSpider):
    name = 'lagou'
    allowed_domains = ['www.lagou.com']
    start_urls = ['https://www.lagou.com/']

    custom_settings = {
        "COOKIES_ENABLED": False,
        "DOWNLOAD_DELAY": 1,
        'DEFAULT_REQUEST_HEADERS': {
            'Accept':
            'application/json, text/javascript, */*; q=0.01',
            'Accept-Encoding':
            'gzip, deflate, br',
            'Accept-Language':
            'zh-CN,zh;q=0.8',
            'Connection':
            'keep-alive',
            'Cookie':
            'user_trace_token=20171015132411-12af3b52-3a51-466f-bfae-a98fc96b4f90; LGUID=20171015132412-13eaf40f-b169-11e7-960b-525400f775ce; SEARCH_ID=070e82cdbbc04cc8b97710c2c0159ce1; ab_test_random_num=0; X_HTTP_TOKEN=d1cf855aacf760c3965ee017e0d3eb96; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=0; PRE_UTM=; PRE_HOST=www.baidu.com; PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DsXIrWUxpNGLE2g_bKzlUCXPTRJMHxfCs6L20RqgCpUq%26wd%3D%26eqid%3Dee53adaf00026e940000000559e354cc; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; index_location_city=%E5%85%A8%E5%9B%BD; TG-TRACK-CODE=index_hotjob; login=false; unick=""; _putrc=""; JSESSIONID=ABAAABAAAFCAAEG50060B788C4EED616EB9D1BF30380575; _gat=1; _ga=GA1.2.471681568.1508045060; LGSID=20171015203008-94e1afa5-b1a4-11e7-9788-525400f775ce; LGRID=20171015204552-c792b887-b1a6-11e7-9788-525400f775ce',
            'Host':
            'www.lagou.com',
            'Origin':
            'https://www.lagou.com',
            'Referer':
            'https://www.lagou.com/',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.7 Safari/537.36',
        }
    }

    rules = (
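        # Follow job-list (zhaopin) and company (gongsi) pages; only
        # jobs/<id>.html detail pages are handed to parse_job.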
        Rule(LinkExtractor(allow=('zhaopin/.*')), follow=True),
        Rule(LinkExtractor(allow=('gongsi/j\d+.html.*')), follow=True),
        Rule(LinkExtractor(allow=r'jobs/\d+.html'),
             callback='parse_job',
             follow=True),
    )

    # def parse_start_url(self, response):
    #     return []
    #
    # def process_results(self, response, results):
    #     return results

    def parse_job(self, response):
        # Parse a job posting from lagou.com
        item_loader = LagouJobItemLoader(item=LagouJobItem(),
                                         response=response)
        item_loader.add_css("title", ".job-name::attr(title)")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("salary", ".job_request .salary::text")
        item_loader.add_xpath(
            "job_city", "/html/body/div[3]/div/div[1]/dd/p[1]/span[2]/text()")
        item_loader.add_xpath(
            "work_years",
            "/html/body/div[3]/div/div[1]/dd/p[1]/span[3]/text()")
        item_loader.add_xpath(
            "degree_need",
            "/html/body/div[3]/div/div[1]/dd/p[1]/span[4]/text()")
        item_loader.add_xpath(
            "job_type", "/html/body/div[3]/div/div[1]/dd/p[1]/span[5]/text()")
        item_loader.add_xpath(
            "tags", "/html/body/div[3]/div/div[1]/dd/ul/li/text()")  #缺
        item_loader.add_css("publish_time", ".publish_time::text")
        item_loader.add_css("job_advantage", ".job-advantage p::text")
        item_loader.add_css("job_desc", ".job_bt div")
        item_loader.add_css("job_addr", '.work_addr a::text')
        item_loader.add_css("company_name", "#job_company dt a img::attr(alt)")
        item_loader.add_css("company_url", "#job_company dt a::attr(href)")
        item_loader.add_value("crawl_time", datetime.now())

        job_item = item_loader.load_item()
        return job_item
Example #28
0
class PlymouthSpider(CrawlSpider):
    name = 'falmouth_gd'
    allowed_domains = ['www.falmouth.ac.uk']
    start_urls = []
    base_url = '%s'

    Lists = [
        'http://flexible.falmouth.ac.uk/courses/ma-advertising-strategy-planning.htm',
        'https://www.falmouth.ac.uk/communication-design-ma',
        'https://www.falmouth.ac.uk/creativeadvertising',
        'http://flexible.falmouth.ac.uk/courses/ma-creative-app-development.htm',
        'http://flexible.falmouth.ac.uk/courses/ma-creative-events-management.htm',
        'https://www.falmouth.ac.uk/film-television-ma',
        'https://www.falmouth.ac.uk/illustrationma',
        'https://www.falmouth.ac.uk/launchpad',
        'https://www.falmouth.ac.uk/leasing-asset-finance',
        'http://flexible.falmouth.ac.uk/courses/ma-photography.htm',
        'http://flexible.falmouth.ac.uk/courses/pgche.htm',
        'https://www.falmouth.ac.uk/professionalwriting',
        'http://flexible.falmouth.ac.uk/courses/ma-writing-for-script-screen.htm',
    ]
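    # base_url is just '%s', so each entry in Lists is used verbatim as a start URL.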

    for i in Lists:
        fullurl = base_url % i
        start_urls.append(fullurl)

    rules = (
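        # Follow links inside the course-listing tiles, then hand every
        # crawled page to parse_item.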
        Rule(LinkExtractor(allow=(r'.*'), restrict_xpaths=('//li[@class="item isotope-item"]/a')),follow=True),
        # Rule(LinkExtractor(allow=r''),follow=True),
        Rule(LinkExtractor(allow=r'.*'),callback='parse_item', follow=False),
    )

    def parse_item(self,response):
        print('==================================',response.url)
        item = HooliItem()

        url = response.url
        print(1,url)

        university = 'FALMOUTH UNIVERSITY'
        print(2,university)

        department = 'NULL'
        country = 'UK'
        city = 'NULL'
        website = 'https://www.falmouth.ac.uk'

        # programme = response.xpath('//div[@class="title"]/h1/text()').extract()
        programme = response.xpath('//div[@class="h1-box"]/h1/text()').extract()
        programme = ''.join(programme)
        print(3,programme)

        ucas_code = 'NULL'

        degree_level = '1'

        degree_type = response.xpath('//div[@class="h1-box"]/h1/text()').extract()
        degree_type = ''.join(degree_type)
        print(4,degree_type)

        start_date_lists = response.xpath('//div[@class="accordion"]//text()').extract()
        start_date_str = ''.join(start_date_lists)
        if "Start dates and application deadlines" in start_date_str:
            sdstart = start_date_str.find("Start dates and application deadlines")
            sdend = start_date_str.find("News and Events")
            start_date = start_date_str[sdstart:sdend]
            item["start_date"] = start_date
        else:
            start_date = 'NULL'
        print(5,start_date)

        # overview = response.xpath('//div[@class="moduleWhite smallmargin"]//text()').extract()
        overview_list = response.xpath('//div[@class="content-block-wrapper"]//text()').extract()
        overview_str = ''.join(overview_list)
        if "Benefits" in overview_str:
            Ostart = overview_str.find("Benefits")
            Oend = overview_str.find("How the course is taught")
            overview = overview_str[Ostart:Oend]
            item["overview"] = overview
        else:
            overview = response.xpath('//div[@class="content-block-wrapper"]//text()').extract()
            overview = ''.join(overview)

        print(6, overview)

        mode = response.xpath('//div[@class="content-block-wrapper"]//dl//text()').extract()
        mode = ''.join(mode)
        # mode_lists = response.xpath('//div[@class="moduleWhite smallmargin"]//text()').extract()
        # mode_str = ''.join(mode_lists)
        # # mode = mode.replace('\n','')
        # # mode = mode.replace('      ','')
        # if "Mode of study:" in mode_str:
        #     mstart = mode_str.find("Mode of study:")
        #     mend = mode_str.find("Summary")
        #     mode = mode_str[mstart:mend]
        #     item["mode"] = mode
        # else:
        #     mode = ''
        print(7,mode)

        types = ''

        # duration_lists = response.xpath('//div[@class="moduleWhite smallmargin"]//text()').extract()
        duration = response.xpath('//div[@class="content-block-wrapper"]//dl//text()').extract()
        duration = ''.join(duration)
        # duration_str = ''.join(duration_lists)
        # # duration = duration.replace('\n','')
        # # duration = duration.replace('    ','')
        # if "Mode of study:" in duration_str:
        #     dstart = duration_str.find("Mode of study:")
        #     dend = duration_str.find("Duration:")
        #     duration = duration_str[dstart:dend]
        #     item["duration"] = duration
        # else:
        #     duration = ''
        print(8,duration)

        modules = response.xpath('//div[@class="accordion ui-accordion ui-widget ui-helper-reset"]//text()').extract()
        modules = ''.join(modules)
        # modules_lists = response.xpath('//div[@class="accordion"]//text()').extract()
        # modules_str = ''.join(modules_lists)
        # if "Course content" in modules_str:
        #     mdstart = modules_str.find("Course content")
        #     mdend = modules_str.find("Assessments")
        #     modules = modules_str[mdstart:mdend]
        #     item["modules"] = modules
        # else:
        #     modules = ''
        # modules = modules.replace('\n','')
        print(9,modules)

        teaching = 'NULL'

        assessment = response.xpath('//div[@class="accordion"]//text()').extract()
        assessment = ''.join(assessment)
        # teaching_assessment_lists = response.xpath('//div[@class="accordion"]//text()').extract()
        # teaching_assessment_str = ''.join(teaching_assessment_lists)
        # if "Assessments" in teaching_assessment_str:
        #     Astart = teaching_assessment_str.find("Assessments")
        #     Aend = teaching_assessment_str.find("How you study")
        #     teaching_assessment = teaching_assessment_str[Astart:Aend]
        #     item["teaching_assessment"] = teaching_assessment
        # else:
        #     teaching_assessment = ''
        print(10,assessment)

        career = response.xpath('//div[@class="field-career-opportunities"]//text()').extract()
        career = ''.join(career)
        print(11, career)

        application_date = 'NULL'

        deadline_lists = response.xpath('//div[@class="accordion"]//text()').extract()
        deadline_str = ''.join(deadline_lists)
        if "Start dates and application deadlines" in deadline_str:
            dlstart  = deadline_str.find("Start dates and application deadlines")
            dlend = deadline_str.find("News and Events")
            deadline = deadline_str[dlstart:dlend]
            item["deadline"] = deadline
        else:
            deadline = 'NULL'
        print(11,deadline)



        application_fee = 'NULL'

        tuition_fee= 'NULL'
        # tuition_fee = ''.join(tuition_fee).replace('\r\n','')
        # tuition_fee = tuition_fee.replace('\n','')
        # tuition_fee = tuition_fee.replace('    ','')
        # print(11, tuition_fee)

        location = 'NULL'
        # location = ''.join(location)
        # print(13,location)

        ATAS = 'NULL'


        GPA = 'NULL'

        average_score = 'NULL'

        accredited_university = 'NULL'

        Alevel = 'NULL'

        IB = 'NULL'

        IELTS_lists = response.xpath('//div[@class="accordion"]//text()').extract()
        IELTS_str = ''.join(IELTS_lists)
        # IELTS = re.findall('(IELTS:|IELTS)? (.*){0,5} \d?.\d? .{0,70}',IELTS)
        if "Entry Requirements" in IELTS_str:
            Istart = IELTS_str.find("Entry Requirements")
            Iend = IELTS_str.find("Financing your studies")
            IELTS = IELTS_str[Istart:Iend]
            item["IELTS"] = IELTS
        else:
            IELTS = 'NULL'
        print(12, IELTS)

        IELTS_L = 'NULL'
        IELTS_S = 'NULL'
        IELTS_R = 'NULL'
        IELTS_W = 'NULL'

        TOEFL = 'NULL'
        TOEFL_L = 'NULL'
        TOEFL_S = 'NULL'
        TOEFL_R = 'NULL'
        TOEFL_W = 'NULL'


        GRE = 'NULL'

        GMAT = 'NULL'

        LSAT = 'NULL'
        MCAT = 'NULL'

        working_experience = 'NULL'

        interview = response.xpath('//div[@class="field-selection-process"]//text()').extract()
        interview = ''.join(interview)
        print(13,interview)

        portfolio = response.xpath('//div[@class="field-selection-process"]//text()').extract()
        portfolio = ''.join(portfolio)
        print(14,portfolio)

        application_documents = 'NULL'

        how_to_apply_lists = response.xpath('//div[@class="accordion"]//text()').extract()
        how_to_apply_str = ''.join(how_to_apply_lists)
        if "How to apply" in how_to_apply_str:
            hstart = how_to_apply_str.find("How to apply")
            hend = how_to_apply_str.find("Start dates and application deadlines")
            how_to_apply = how_to_apply_str[hstart:hend]
            item["how_to_apply"] = how_to_apply
        else:
            how_to_apply = 'NULL'
        print(13,how_to_apply)

        entry_requirements = response.xpath('//*[@id="start-of-content"]/div[2]/div[2]/div[1]//text()').extract()
        entry_requirements = ''.join(entry_requirements)
        
        # entry_requirements_lists = response.xpath('//div[@class="accordion"]//text()').extract()
        # entry_requirements_str = ''.join(entry_requirements_lists)
        # # EntryRequirements = EntryRequirements.replace(' ','')
        # if "Entry Requirements" in entry_requirements_str:
        #     Estart = entry_requirements_str.find("Entry Requirements")
        #     Eend = entry_requirements_str.find("Financing your studies")
        #     entry_requirements = entry_requirements_str[Estart:Eend]
        #     item["entry_requirements"] = entry_requirements
        # else:
        #     entry_requirements = ''
        print(14,entry_requirements)

        chinese_requirements = 'NULL'

        school_test = 'NULL'

        degree_description = 'NULL'

        SATI = 'NULL'

        SATII = 'NULL'

        SAT_code = 'NULL'

        ACT = 'NULL'

        ACT_code = 'NULL'

        other = 'NULL'

        create_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print(15, create_time)

        item["url"] = url
        item["university"] = university
        item["country"] = country
        item["city"] = city
        item["website"] = website
        item["department"] = department
        item["programme"] = programme
        item["ucas_code"] = ucas_code
        item["degree_level"] = degree_level
        item["degree_type"] = degree_type
        item["start_date"] = start_date
        item["degree_description"] = degree_description
        item["overview"] = overview
        item["mode"] = mode
        item["duration"] = duration
        item["modules"] = modules
        item["teaching"] = teaching
        item["assessment"] = assessment
        item["career"] = career
        item["application_date"] = application_date
        item["deadline"] = deadline
        item["application_fee"] = application_fee
        item["tuition_fee"] = tuition_fee
        item["location"] = location
        item["ATAS"] = ATAS
        item["GPA"] = GPA
        item["average_score"] = average_score
        item["accredited_university"] = accredited_university
        item["Alevel"] = Alevel
        item["IB"] = IB
        item["IELTS"] = IELTS
        item["IELTS_L"] = IELTS_L
        item["IELTS_S"] = IELTS_S
        item["IELTS_R"] = IELTS_R
        item["IELTS_W"] = IELTS_W
        item["TOEFL"] = TOEFL
        item["TOEFL_L"] = TOEFL_L
        item["TOEFL_S"] = TOEFL_S
        item["TOEFL_R"] = TOEFL_R
        item["TOEFL_W"] = TOEFL_W
        item["GRE"] = GRE
        item["GMAT"] = GMAT
        item["LSAT"] = LSAT
        item["MCAT"] = MCAT
        item["working_experience"] = working_experience
        item["interview"] = interview
        item["portfolio"] = portfolio
        item["application_documents"] = application_documents
        item["how_to_apply"] = how_to_apply
        item["entry_requirements"] = entry_requirements
        item["chinese_requirements"] = chinese_requirements
        item["school_test"] = school_test
        item["SATI"] = SATI
        item["SATII"] = SATII
        item["SAT_code"] = SAT_code
        item["ACT"] = ACT
        item["ACT_code"] = ACT_code
        item["other"] = other
        item["create_time"] = create_time

        yield item

    def getTuition_fee(self, tuition_fee):
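        # Collect every comma-formatted figure (e.g. '9,500') from the fee
        # text and return the largest one as an int.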
        allfee = re.findall(r'\d+,\d+', tuition_fee)
        # print(allfee)
        for index in range(len(allfee)):
            fee = allfee[index].split(",")
            allfee[index] = ''.join(fee)
            # print(allfee[index])
        # print(allfee)
        maxfee = 0
        for fee in allfee:
            if int(fee) >= maxfee:
                maxfee = int(fee)
        return maxfee
Example #29
0
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name': "//h1[@class='product_title entry-title']",
    'price':
    "//div[@class='summary entry-summary']/div/p[@class='price']/span[@class='amount']",
    'category': "",
    'description': "//div/div[@class='summary entry-summary']/div",
    'images': "//div/div[@class='images']/a/img/@src",
    'canonical': "",
    'base_url': "",
    'brand': ""
}
name = 'dartchocolate.com'
allowed_domains = ['dartchocolate.com']
start_urls = ['http://dartchocolate.com']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(allow=['/product/']), 'parse_item'),
    Rule(LinkExtractor(allow=['']), 'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
Example #30
0
class LapolarSpider(CrawlSpider):
    name = "lapolar"
    allowed_domains = ['lapolar.cl']
    start_urls = ('http://www.lapolar.cl/internet/catalogo/', )

    rules = (
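        # Product detail pages go to parse_product; group/category listings
        # are followed, and the 'listados' pages expose further product
        # routes via parse_links.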
        Rule(LinkExtractor(
            allow="http://www.lapolar.cl/internet/catalogo/detalle"),
             callback='parse_product',
             follow=True),
        Rule(LinkExtractor(allow=[
            'http://www.lapolar.cl/internet/catalogo/grupo',
            'http://www.lapolar.cl/internet/catalogo/categoria'
        ]),
             callback='parse',
             follow=True),
        Rule(LinkExtractor(
            allow='http://www.lapolar.cl/internet/catalogo/listados'),
             callback='parse_links',
             follow=True),
        Rule(LinkExtractor(
            allow='http://www.lapolar.cl/internet/catalogo/todolistados'),
             callback='parse_links',
             follow=True),
    )

    def parse_links(self, response):
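        # Listing pages embed product routes as "ruta" fields in inline
        # JavaScript; build a detail-page request for each of them.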
        jsonlinks = response.xpath('//script[@language="javascript"]/text()'
                                   ).re('"ruta":"[a-z0-9/_]+')
        for link in jsonlinks:
            url = "http://www.lapolar.cl/internet/catalogo/detalles/" + link.replace(
                '"ruta":"', '')
            yield scrapy.Request(url, callback=self.parse_product)

    def parse_product(self, response):
        try:
            item = AliceItem()
            item['url'] = response.url

            try:
                item['title'] = response.xpath(
                    '//*[@class="titulo1 descrip_jq"]/text()').extract(
                    )[0].encode('ascii', 'ignore')
            except IndexError:
                item['title'] = ''

            try:
                item['picture'] = response.xpath(
                    '/html/head/meta[3]/@content').extract()[0]
            except IndexError:
                item['picture'] = ''

            try:
                item['price'] = int(
                    response.xpath('//*[@class="precio precio_jq"]/text()').re(
                        '\d\S*')[0].replace('.', ''))
            except (IndexError, ValueError):
                item['price'] = 0

            item['brand'] = ""
            item['store'] = "lapolar"
            item['id_store'] = 4

            tags = response.xpath(
                '//tr[not(@id)]/td[@valign="top"]/div[@width]/a/text()'
            ).extract()

            # Up to five category tags; missing positions default to "".
            for i in range(5):
                try:
                    item['tag%d' % (i + 1)] = tags[i]
                except IndexError:
                    item['tag%d' % (i + 1)] = ""
            yield item
        except IOError:
            print('cannot open', response.url)