Example #1
    def parse(self, response):
        PrintLog.print_start_flag(self.parse.__name__)

        sel = Selector(response)

        # pdb.set_trace()
        # print response.url
        # print response.body

        # Extract friendsList
        '''
        The regex below finds and extracts the "..." part of the string 'user_list : [...],'
        (?<=           # start of the required prefix (lookbehind)
        user_list : \[ # match the literal 'user_list : ['
        )              # end of the prefix
        [\s\S]*        # match any text
        (?=            # start of the required suffix (lookahead)
        \],            # match the literal '],'
        )              # end of the suffix
        '''
        friends = sel.re(r'(?<=user_list : \[)[\s\S]*(?=\],)')
        yield self.parse_friends_list(friends_list=friends)

        # Try to fetch the next page of data
        #pdb.set_trace()
        PrintLog.print_log("get next page")
        page_count_str_list = sel.re(r'pageCount :\s*(.*)')
        if page_count_str_list:
            m = re.findall(r"\d", page_count_str_list[0])
            self.total_page_count = int(m[0])
            # print "page_count_num=", self.total_page_count
        self.page_num += 1  # next page number
            if self.page_num < self.total_page_count:
                yield self.request_page(page_idx=self.page_num)
Example #2
    def parse(self, response):
        PrintLog.print_start_flag(self.parse.__name__)

        sel = Selector(response)

        # pdb.set_trace()
        # print response.url
        # print response.body

        # Extract friendsList
        '''
        The regex below finds and extracts the "..." part of the string 'user_list : [...],'
        (?<=           # start of the required prefix (lookbehind)
        user_list : \[ # match the literal 'user_list : ['
        )              # end of the prefix
        [\s\S]*        # match any text
        (?=            # start of the required suffix (lookahead)
        \],            # match the literal '],'
        )              # end of the suffix
        '''
        friends = sel.re(r'(?<=user_list : \[)[\s\S]*(?=\],)')
        yield self.parse_friends_list(friends_list=friends)

        # Try to fetch the next page of data
        #pdb.set_trace()
        PrintLog.print_log("get next page")
        page_count_str_list = sel.re(r'pageCount :\s*(.*)')
        if page_count_str_list:
            m = re.findall(r"\d", page_count_str_list[0])
            self.total_page_count = int(m[0])
            # print "page_count_num=", self.total_page_count
        self.page_num += 1  # next page number
            if self.page_num < self.total_page_count:
                yield self.request_page(page_idx=self.page_num)
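The lookbehind/lookahead pattern used in the two examples above can also be tried outside a spider with the plain re module. A minimal sketch; the sample string below is made up:

import re

# Same pattern as above: grab everything between 'user_list : [' and '],'
sample = 'var data = { user_list : [{"uid": 1}, {"uid": 2}], pageCount : 3 };'
match = re.search(r'(?<=user_list : \[)[\s\S]*(?=\],)', sample)
if match:
    print(match.group(0))  # -> {"uid": 1}, {"uid": 2}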
Example #3
     def parse(self, response):
         sel = Selector(response)

         item = MarketScrapyItem()
         #item['name'] = 'test'
         item['name'] = sel.re('id=\"mdname\"(.*?)</span')
         item['bus'] = sel.re('id=\"inforBody\".*')
         return item 
Example #4
    def parse(self, response):
        sel = Selector(response)

        item = MarketScrapyItem()
        #item['name'] = 'test'
        item['name'] = sel.re('id=\"mdname\"(.*?)</span')
        item['bus'] = sel.re('id=\"inforBody\".*')
        return item
Example #5
 def parse_article(self, response):
     item = WechartAccount()
     sel = Selector(response)
     user_name = sel.re('<span class="profile_meta_value">([^<]*)</span>')
     nickname = sel.re('var nickname = "([^"]*)";')
     image_url = sel.re('hd_head_img : "([^"]*)"')
     item['nickname'] = ''.join(nickname)
     item['user_name'] = user_name[0]
     item['image_url'] = ''.join(image_url)
     return item
Example #6
 def parse_item(self, response):
     item = TupianItem()
     x = Selector(response)
     imgs = x.re('src="(http.*?\.jpg)".*?alt')
     imgname = x.re('src.*?alt="(.*?)"')
     for i in range(len(imgs)):
         item['tupianming'] = imgname[i]
         item['images_urls'] = [imgs[i]]
         item['leibie'] = x.xpath('//span/h1/text()').extract()
         yield item
Example #7
 def parse_item(self, response):
     item = TianqiItem()
     x = Selector(response)
     rq_list = x.re('(\d+年\d+月\d+日)')
     tqzk_list = x.re('([\u4e00-\u9fa5]+\s+/[\u4e00-\u9fa5]+)')[::2]
     qw_list = x.re('\d+℃\s+/\s+-?\d+℃')
     for i in range(len(tqzk_list)):
         item['rq'] = rq_list[i]
         item['tqzk'] = re.sub('\s+', '', tqzk_list[i])
         item['qw'] = re.sub('\s+', '', qw_list[i])
         yield item
Example #8
    def parse_product(self, response):
        self.response = response

        selector = Selector(response=response)

        ids = selector.re('"skuId":"(\w+)"')
        savings = selector.re('"SAVE":"(\d+)%"')
        options = selector.re('"Option":"(.+?)"')
        prices = selector.re(r'"RP":".(\d+\.\d{2})"')
        name = selector.re('productDisplayName="(.+?)"')

        items = self.load(prices, savings, ids, name[0], options)
        return items
Example #9
 def parse_news(self, response):
     sel = Selector(response)
     pattern = re.match(self.url_pattern, str(response.url))
     
     item = SinaItem()
     item['source'] = 'sina' # pattern.group(1)
     item['date'] = ListCombiner(str(pattern.group(2)).split('-'))
     item['newsId'] = sel.re(r'comment_id:(\d-\d-\d+)')[0]
     item['cmtId'] = item['newsId']
     item['channelId'] = sel.re(r'comment_channel:(\w+);')[0]
     item['comments'] = {'link':str('http://comment5.news.sina.com.cn/comment/skin/default.html?channel='+item['channelId']+'&newsid='+item['cmtId'])}
     item['contents'] = {'link':str(response.url), 'title':u'', 'passage':u''}
     item['contents']['title'] = sel.xpath("//h1[@id='artibodyTitle']/text()").extract()[0]
     item['contents']['passage'] = ListCombiner(sel.xpath('//p/text()').extract())
     return item
Example #10
 def parse_news(self, response):
     sel = Selector(response)
     pattern = re.match(self.url_pattern, str(response.url))
     datenow = GetDate()
     item = SinaItem()
     item['source'] = 'sina' # pattern.group(1)
     item['date'] = datenow
     item['newsId'] = sel.re(r'comment_id:(\d+-\d-\d+)')[0]
     item['cmtId'] = item['newsId']
     item['channelId'] = sel.re(r'comment_channel:(\w+);')[0]
     item['comments'] = {'link':str('http://comment5.news.sina.com.cn/page/info?format=json&channel='+item['channelId']+'&newsid='+item['cmtId']+'&group=0&compress=1&ie=gbk&oe=gbk&page=1&page_size=100&jsvar=requestId_24')}
     item['contents'] = {'link':str(response.url), 'title':u'', 'passage':u''}
     item['contents']['title'] = sel.xpath("//h1[@id='artibodyTitle']/text()").extract()[0]
     item['contents']['passage'] = ListCombiner(sel.xpath('//p/text()').extract())
     yield item
Example #11
 def parse_item(response):
     sel = Selector(response)
     url = response.request.url
     if re.match(r'.*?people.com.cn.*?/\d+/\d+/.*?',
                 url) and 'BIG' not in url:
         content = response.xpath(
             '//*[@id="rwb_zw"]/p/text() | //*[@id="rwb_zw"]/p/strong/text()'
         ).extract()
         if content:
             item = NewsItem(
                 domainname='http://people.com.cn',
                 chinesename='人民网',
                 url=sel.root.base,
                 title=sel.css('div.text_title > h1::text').extract_first(),
                 subtitle=sel.css('.sub::text').extract_first(),
                 language='中文',
                 encodingtype='utf-8',
                 corpustype='网络',
                 timeofpublish=sel.re(r'\d{4}年\d{2}月\d{2}日\d{2}:\d{2}')[0],
                 content=''.join(content),
                 source=sel.css(
                     'div.box01 > div.fl > a::text').extract_first(),
                 author=sel.css('p.author::text').extract_first())
             item = judge_time_news(item)
             if item:
                 yield item
Example #12
    def parse(self, response):
        selector = Selector(response)
        PageUrl = None
        # Grab the next-page URL
        try:
            PageUrl = selector.re(
                "href=\"(http:.*\d)\" class=\"ui_page_item ui_page_next\"")[0]
        except IndexError:
            self.log('One done!' + response.url)
        # Scrape the 15 itinerary summaries on this page
        divitems = selector.xpath("//div[@class='items']").extract()
        for item in divitems:
            result = QYURLItem()
            journey_name_pattern = re.compile(u'<dd>(.*)</dd>')
            url_pattern = re.compile('<a href=\"(//.*)\" class.*>')
            day_pattern = re.compile('<strong>(\d*)</strong>')
            line_pattern = re.compile('<p>(.*)</p>')
            date_pattern = re.compile(r'<dt>(20.*) 出发</dt>')
            lable_pattern = re.compile('<strong>(\W*)</strong>')
            result['journey_name'] = journey_name_pattern.findall(item)
            result['url'] = url_pattern.findall(item)
            result['day'] = day_pattern.findall(item)
            result['line'] = line_pattern.findall(item)
            result['date'] = date_pattern.findall(item)
            result['lable'] = lable_pattern.findall(item)
            yield result

    # self.log(PageUrl, level=log.DEBUG)
Example #13
    def parse_item(self, response):
        """ Main parse function
        """
        sel = Selector(response)
        item = ProductItem()  

        item['source']  = 'tmall'       
        item['name']    = self.get_product_name( sel )        
        item['img']     = sel.xpath("//ul[@id='J_UlThumb']/li")[0].xpath(".//a/img/@src").extract()[0]

        item['category'] = self.get_category(response)
        
        try:
            # Grab the TShop string and normalize it towards JSON
            TShop_str = sel.re('TShop\.Setup\(((.|\n)+?)\);')[0]
            # Strip comments; currently only Tmall Supermarket pages have them, starting with a comma
            regex = re.compile(',\s*\/\/[^\n]*')
            TShop_str = re.sub(regex, ',', TShop_str)
            TShop = eval( TShop_str, type('Dummy', (dict,), dict(__getitem__=lambda s,n:n))() )      
        except SyntaxError:
            return  
        
        item['itemId']  = TShop.get('itemDO').get('itemId', '')
        item['url']     = 'http://detail.tmall.com/item.htm?id=' + item['itemId']
        item['date']    = date.today().strftime('%Y-%m-%d')
        item['attr'], item['brand'] = self.get_attr_and_brand( sel )
        
        skuMap = self.get_sku_chinese_map( sel, TShop )
        initApi_url = TShop.get('initApi')

        yield Request(  initApi_url, 
                        headers={'Referer': 'http://www.google.com.hk/'}, 
                        meta={'item': item, 'skuMap': skuMap}, 
                        callback=self.parse_initapi )
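The eval call above (and in the other Tmall parsers further down) relies on a dict subclass whose __getitem__ returns the looked-up key itself, so bare JavaScript identifiers inside the TShop.Setup(...) blob evaluate to their own names instead of raising NameError. A minimal sketch of that trick; the TShop-like string here is invented:

# Dummy maps any unknown name to the name itself when used as eval's globals
Dummy = type('Dummy', (dict,), dict(__getitem__=lambda s, n: n))
tshop_str = '{itemDO: {itemId: "12345", isSku: true}, initApi: "http://example.com/initApi"}'
TShop = eval(tshop_str, Dummy())
print(TShop.get('itemDO').get('itemId', ''))  # -> 12345
print(TShop.get('itemDO').get('isSku'))       # -> true (the bare identifier evaluated to its own name string)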
Example #14
def search(q):
	'''Search the library catalog by book title
	'''
	
	fdata = {
		'tag': 'search',
		'subtag': 'simsearch',
		'gcbook': 'yes',
		'viewtype': '',
		'flword': '',
		'viewtype': '',
		'q': q
	}
	
	resp = requests.post(search_url, data=fdata)
	# Get the number of matching records
	s_res = Selector(text=resp.content.decode('utf-8')).xpath('//p[@id="page"]/span/text()')
	# If no records were found, result_list is empty
	result_list = s_res.extract()
	if len(result_list) == 0:
		return "没有检索到记录"
	result_str = result_list[0]
	num = int(s_res.re('[\d]+')[0])
	
	if num > 3:
		note = ""
		if num > 10:
			note = "\n注:只显示前10条结果,得到所有检索结果:" +  search_url + "\n======"
		return result_str + "\n======" + note + getManyLinks(resp, num)
	else:
		return result_str + "\n======" + getdetail(resp, num)
Example #15
 def parse_product(self, response):
     sel = Selector(response)
     price = sel.re(re.compile('jsProductPrice = \'(.*)\';'))
     categories = sel.xpath('//div[@id="navBreadCrumb"]/a/text()')[1:].extract()
     brand = sel.xpath('//span[@class="product_manufacturer"]/text()').re('Manufactured by: (.*)')
     brand = brand[0].strip() if brand else ''
     sku = sel.xpath('//span[@class="product_model"]/text()').re('Ref: (.*)')
     sku = sku[0].strip() if sku else ''
     identifier = re.search('p-(.*)\.html', response.url).group(1)
     image_url = response.xpath('//div[@id="replace_image_zoom"]//img[@class="zoom_pic"]/@src').extract()
     if image_url:
         image_url = response.urljoin(image_url[0])
     name = sel.xpath('//h1[@class="productGeneral"]/text()').extract()
     loader = ProductLoader(item=Product(), response=response)
     loader.add_value('identifier', identifier)
     loader.add_value('sku', sku)
     loader.add_value('name', name)
     loader.add_value('price', price)
     price = loader.get_output_value('price')
     if price and Decimal(price) < Decimal('400.0'):
         loader.add_value('shipping_cost', Decimal('35.00'))
     loader.add_value('url', response.url)
     if image_url:
         loader.add_value('image_url', image_url)
     for category in categories:
         loader.add_value('category', category)
     loader.add_value('brand', brand)
     yield loader.load_item()
Example #16
 def myre(self, response, key_list):
     date = Selector(response)
     for key in key_list:
         #llprint key
         if date.re(key):
             return 1
     return 0
Example #17
class ChainReactionReviews(ChainReaction):
    name = 'chain-reaction-reviews'

    response = None
    selector = None
    item = None
    loader = None

    def _register(self, response):
        self.response = response
        self.selector = Selector(response=response)
        self.item = response.meta['item'] if 'item' in response.meta.keys() else Review()
        self.loader = ChainReactionReviewLoader(self.item, response=self.response)

    def parse_product(self, response):
        self._register(response)

        self.loader.add_value('slug', self.selector.re('productDisplayName="(.+?)"'))
        self.loader.add_value('name', self.selector.re('productDisplayName="(.+?)"'))
        self.loader.add_value('retailer', RETAILER)
        self.loader.add_value('manufacturer', MANUFACTURER)

        request = Request(response.url + '/reviews.djs?format=embeddedhtml', callback=self.parse_reviews)
        request.meta['item'] = self.loader.load_item()

        return request

    def parse_reviews(self, response):
        self._register(response)

        self.loader.add_value('review', 'review')
        self.loader.add_value('author', 'author')
        self.loader.add_value('date', 'date')

        return self.loader.load_item()
Example #18
    def parse_item(response):
        sel = Selector(response)
        url = response.request.url
        if re.match(r'.*?sina.com.*?/\d{4}-\d{2}-\d{2}/.*?', url):
            content = response.xpath(
                '//*[@id="artibody"]//p//text()').extract()
            # Remove the editor byline
            editor = response.xpath(
                '//*[@class="article-editor"]/text()').extract_first()
            if editor:
                content.remove(editor)
            publish_time = sel.re(r'\d{4}年\d{2}月\d{2}日.{0,1}\d{2}:\d{2}')[0]
            if ' ' in publish_time:
                publish_time = publish_time.replace(' ', '')

            if content:
                item = NewsItem(
                    domainname='http://sina.com.cn',
                    chinesename='新浪网',
                    url=sel.root.base,
                    title=sel.css('#artibodyTitle::text, #main_title::text'
                                  ).extract_first(),
                    subtitle=sel.css('.sub::text').extract_first(),
                    language='中文',
                    encodingtype='utf-8',
                    corpustype='网络',
                    timeofpublish=publish_time,
                    content=''.join(content),
                    source=sel.xpath(
                        '//*[@data-sudaclick="media_name"]/text() | //*[@data-sudaclick="media_name"]/a/text()'
                    ).extract_first(),
                    author=None)
                item = judge_time_news(item)
                if item:
                    yield item
Example #19
 def myre(self,response,key_list):
 	date=Selector(response)
 	for key in key_list:
 		#llprint key
 		if date.re(key):
 			return 1
 	return 0
Example #20
 def parse_item(response):
     sel = Selector(response)
     print(sel)
     url = response.request.url
     if re.match(r'.*?tibet.people.com.cn/.*?', url):
         print('---------------------')
         print(url)
         content = response.xpath('//html/body/div[2]/div[4]/div[1]/div[2]/div[2]/text()').extract()
                                # '//*[@id="rwb_zw"]/p/text() | //*[@id="rwb_zw"]/p/strong/text()'| //*[@id="content"]/p[2]/span/span/text()
         print(content)
         if content:
             item = NewsItem(
                 domainname='http://tibet.people.com.cn',
                 chinesename='people',
                 url=sel.root.base,
                 title=sel.css('.gq_content > h1:nth-child(2)::text').extract_first(),
                 subtitle=sel.css('.sub::text').extract_first(),
                 language='tibet',
                 encodingtype='utf-8',
                 corpustype='网络',
                 timeofpublish=sel.re(r'\d{4}.*?\d{2}.*?\d{2}.*?\d{2}:\d{2}')[0].replace('ལོའི་ཟླ་ ', '年').replace('ཚེས་', '月').replace('ཉིན།  ', '日'),
                 # timeofpublish = re.search(r'\d{4}.*?\d{2}.*?\d{2}',sel.css('.title_hd > p:nth-child(2)::text').extract_first()).group(0),
                 content=''.join(content),
                 # source=sel.css('.title_hd > p:nth-child(2)::text').extract_first(),
                 # author=sel.css('.title_hd > p:nth-child(2)::text').extract_first()
             )
             print(item.get("title", None))
             print(item.get("timeofpublish", None))
             print(item.get("source", None))
             print(item.get("author", None))
             item = judge_time_news_people(item)
             if item:
                 yield item
Example #21
 def parse_item(self, response):
     items = DoubanItem()
     x = Selector(response)
     lb = x.xpath('//div[@class="hd"]/h1/text()').extract()[0]
     zz_yz = x.re('作者</span.*?</a></span></span></p>')
     x_k = x.xpath('//div[@class="info"]')
     sm = x_k.xpath('./div[@class="title"]/a/text()').extract()
     jj = x_k.xpath('./div[@class="article-desc-brief"]/text()').extract()
     for i in range(len(sm)):
         items['lb'] = lb
         items['sm'] = sm[i]
         zz_k = re.findall('作者</span.*?</a></span></span>', zz_yz[i])
         items['zz'] = re.findall('([〕〔\u4e00-\u9fa5·\s]{2,})', zz_k[0])[1:]
         yz_k = re.findall('译者</span.*?</a></span></span>', zz_yz[i])
         if not yz_k:
             items['yz'] = None
         else:
             items['yz'] = re.findall('([〕〔\u4e00-\u9fa5·]+)', yz_k[0])[1:]
         pf = x_k[i].xpath(
             './div/span[@class="rating-average"]/text()').extract()
         if not pf:
             items['pf'] = None
         else:
             items['pf'] = pf[0]
         items['jj'] = jj[i]
         yield items
     #i['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
     #i['name'] = response.xpath('//div[@id="name"]').extract()
     #i['description'] = response.xpath('//div[@id="description"]').extract()
     return
Example #22
    def parse_post(self, response):
        sel = Selector(response)

        post_has_been_removed = sel.xpath(self.post_parse_rules['invalid'])
        if len(post_has_been_removed) > 0:
            return None

        title = sel.xpath(self.post_parse_rules['title'])
        # postinfo = sel.xpath("//div[@class='postinginfos']")
        date_posted = sel.xpath(self.post_parse_rules['date_posted'])
        date_updated = sel.xpath(self.post_parse_rules['date_updated'])
        userbody = sel.xpath(self.post_parse_rules['body'])
        text = sel.xpath(self.post_parse_rules['text'])
        pictures = sel.re(self.post_parse_rules['picture'])

        match = re.search("(\d+)\.\w+$", response.url)
        filename, pid = (match.group(0), match.group(1))

        item = PostItem()
        item['pid'] = pid
        item['filename'] = filename
        item['region'] = self.region
        item['domain'] = self.domain
        item['url'] = response.url
        item['body'] = response.body
        item['pictures'] = list(set(pictures)) # Eliminate duplicates
        item['title'] = title.extract()[0]
        item['date_posted'] = date_posted.extract()[0]
        if len(date_updated) > 0:
            item['date_updated'] = date_updated.extract()[0]
        item['userbody'] = userbody.extract()[0]
        item['text'] = text.extract()[0]

        return item
Example #23
 def down_file(self,response):
     '''
     Download the software.
     The response is a snippet of JS; pull the redirect URL out of it and download the file with curl.
     :param response:
     :return:
     '''
     try:
         item = response.meta['item']
         sel = Selector(response);
         # Match out the real download link for the software
         url = sel.re(r"window.location.href\s+=\s+\'([^\']+)")
         if url:
             # The download URL is encoded; replace the garbled parts and split out the file name
             otherurl = url[0].replace('%2','/')
             # Parent directory for the downloaded files
             parent_file = os.path.sep+settings.PARENT_FILE_NAME
             if not os.path.exists(parent_file):
                 os.mkdir(parent_file)
             # Path of the software file
             item['file_name'] = otherurl.split('/')[-1]
             filename = parent_file  +  os.path.sep  +  otherurl.split('/')[-1]
             # Build the curl command: -i to resume a partial download, -o for the output file
             commond = 'curl -i -o ' + filename + ' ' + url[0]
             recode = subprocess.call(commond,shell=True)
             print 'successful!'
             self.col.update({'vendor': self.vendor}, {'$set': {'state': 'crawling'}})
             yield item
     except Exception as e:
         print e.message
         self.col.update({'vendor': self.vendor}, {'$set': {'state': 'error'}})
Example #24
    def parseTmall(self, response):
        """ Tmall parser
		"""
        sel = Selector(response)
        item = ProductItem()

        item['surl'] = response.url
        item['source'] = 'tmall'
        item['name'] = self.get_product_name(sel)

        try:
            # Grab the TShop string and normalize it towards JSON
            TShop_str = sel.re('TShop\.Setup\(((.|\n)+?)\);')[0]
            # Strip comments; currently only Tmall Supermarket pages have them, starting with a comma
            regex = re.compile(',\s*\/\/[^\n]*')
            TShop_str = re.sub(regex, ',', TShop_str)
            TShop = eval(
                TShop_str,
                type('Dummy', (dict, ), dict(__getitem__=lambda s, n: n))())
        except SyntaxError:
            return

        item['itemId'] = TShop.get('itemDO').get('itemId', '')
        item['url'] = 'http://detail.tmall.com/item.htm?id=' + item['itemId']

        initApi_url = TShop.get('initApi')

        yield Request(initApi_url,
                      headers={'Referer': 'http://www.google.com.hk/'},
                      meta={'item': item},
                      callback=self.parse_initapi)
Example #25
    def parse(self,response):
        """
        On the first call the response is the forum home page, so the XPath rules differ;
        after the sub-forum URLs are collected from the home page, parse is called again for each sub-forum URL
        """
        sel=Selector(response)
        items=[]
        if self.IndexFlag:
            self.IndexFlag=False
## Because of how these BBS forum pages are built, XPath is not a great way to extract the links here
            urls=sel.re(r"(http://bbs2.99nets.me/forum.*html)")[1:-2]
            for url in urls:
                msg("------URL: %s -----" % url,level="DEBUG")
                yield Request(url,callback=self.parse)
            msg("-----Crawl END----",level="INFO")
        else:
            sites=sel.xpath("//form/table/tbody/tr/th/a[@class='s xst']")
            for site in sites:
                item=BbssecondItem()
                item['title']=site.xpath("text()").extract()
                item['link'] =site.xpath("@href").extract()
                yield item
            url,text=response.url,sel.xpath("//title/text()").extract()
            msg("---CURRENT URL:%s \n ---TEXT:%s" % (url,str(text).encode("UTF8")),
                    level='INFO')
            ## Sub-forum pagination
            rule="//div[@class='pg']/a[@class='nxt']/"
            if sel.xpath(rule+"text()").extract():
                yield Request(sel.xpath(rule+"@href").extract()[0]
                        ,callback=self.parse)
Example #26
    def parse_item(response):
        sel = Selector(response)
        url = response.request.url
        if re.match(r'.*?sohu.com.*?/\d{4}\d{2}\d{2}/.*?', url):
            content = response.xpath(
                '//*[@itemprop="articleBody"]//p//text()').extract()
            # Some paragraphs are not under <p> tags, so fall back to a broader XPath
            if len(content) < 3:
                content = response.xpath(
                    '//*[@itemprop="articleBody"]//p//text() | //*[@id="contentText"]//div/text()'
                ).extract()

            publish_time = sel.re(
                r'\d{4}-\d{2}-\d{2} {0,1}\d{2}:\d{2}:\d{2}')[0]
            if content:
                item = NewsItem(
                    domainname='http://sohu.com',
                    chinesename='搜狐网',
                    url=sel.root.base,
                    title=sel.xpath(
                        '//*[@itemprop="headline"]/text()').extract_first(),
                    subtitle=sel.css('.sub::text').extract_first(),
                    language='中文',
                    encodingtype='utf-8',
                    corpustype='网络',
                    timeofpublish=publish_time,
                    content=''.join(content),
                    source=sel.xpath(
                        '//*[@id="media_span"]/span/text()').extract_first(),
                    author=sel.xpath(
                        '//*[@id="author_baidu"]/text()').extract_first())
                item = judge_time_news(item)
                if item:
                    yield item
Example #27
 def parse_page(self, response):
     print response.url
     sel = Selector(response)
     email = sel.re('(\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,6})')[0]
     print email
     if email not in email_in_file and email not in added_email:
         file.write(email+'\n')
         added_email.append(email)
Example #28
 def parse_page(self, response):
     print response.url
     sel = Selector(response)
     email = sel.re('(\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,6})')[0]
     print email
     if email not in email_in_file and email not in added_email:
         file.write(email+'\n')
         added_email.append(email)
Example #29
 def parse_item(self,response):
     sel= Selector(response)
     
     item = CommitsspiderItem()
     item['cve'] = sel.re(self.patt)
     if item['cve'] != []:
         item['url'] = response.url 
         yield item
     return
Example #30
 def parse_page(self, response):
     sel = Selector(response)
     emails = sel.re('(\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,6})')
     emails = list(filter(lambda x: x != '*****@*****.**', emails))
     if bool(emails):
         for email in emails:
             if email not in email_in_file and email not in current_session_emails:
                 file.write(email+'\n')
                 current_session_emails.append(email)
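The email regex used in these spiders can also be exercised without a crawl by building a Selector from a string. A small sketch, assuming Scrapy is installed; the HTML snippet and addresses are invented:

from scrapy.selector import Selector

html = '<p>Contact us at alice@example.com or bob@example.org</p>'
emails = Selector(text=html).re(r'(\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,6})')
print(emails)  # -> ['alice@example.com', 'bob@example.org']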
Example #31
 def parse(self, response):
     sel = Selector(response)
     link = sel.re('((http).*?(\.mp4))')[0]
     name = sel.xpath('//title/text()').extract()[0]
     items = []
     item = MaizieduCourseItem()
     item['link'] = link
     item['name'] = name
     items.append(item)
     return items
Example #32
 def parse_item(self, response):
     response = Selector(response)
     itme = S80Item()
     itme['电影'] = response.xpath('//h1/text()').extract()
     itme['类型'] = response.xpath(
         '//span[@class="span_block"][1]/a/text()').extract()
     itme['演员'] = response.re('a href="/actor/.*?>([\u4e00-\u9fa5·]+)<')
     itme['地区'] = response.xpath(
         '//div[@class="clearfix"]/span[2]/a/text()')[0].extract()
     itme['语言'] = response.xpath(
         '//div[@class="clearfix"]/span[3]/a/text()').extract()
     itme['导演'] = response.xpath(
         '//div[@class="clearfix"]/span[4]/a/text()').extract()
     itme['片长'] = response.xpath(
         '//div[@class="clearfix"]/span[6]/text()').extract()
     itme['上映时间'] = response.xpath(
         '//div[@class="clearfix"]/span[5]/text()').extract()
     itme['下载链接'] = list(set(response.re('"(thunder://.*?)"')))
     return itme
Example #33
 def parse_page(self, response):
     sel = Selector(response)
     email = sel.re('(\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,6})')
     if bool(email):
         email = email[0]
         if email + "\n" not in email_in_file and email not in added_email:
             file.write(email+'\n')
             added_email.append(email)
             print "Spider: Mandy. Email {0} added to file".format(email)
         else:
             print "Spider: Mandy. Email {0} already in the file".format(email)
Example #34
 def get_asn(self, response):
     parent = response.meta['parent']
     base = parent.get('base')
     if base:
         sel = Selector(response)
         asn = ''.join(sel.re('([\d]+)\s+\<')) + ''.join(
             sel.xpath("//span[1]/text()").extract()).strip()
         print '______________________________'
         print asn
         base['asn'] = asn
     yield parent
Example #35
    def parse_item(self, response):
        selector = Selector(response)

        # Use regex to find valid emails
        emails = selector.re(self.EMAIL_REGEX)
        if (emails):
            item = CompanyItem({'link': response.url})
            item['emails'] = emails

            return item

        return
Example #36
 def parse_news(self, response):
     sel = Selector(response)
     pattern = re.match(self.url_pattern, str(response.url))
     
     item = NewsItem()
     item['source'] = 'news.sina.com.cn' # pattern.group(1)
     item['date'] = ListCombiner(str(pattern.group(2)).split('-'))
     item['newsId'] = sel.re(r'comment_id:(\d-\d-\d+)')[0]
     item['contents'] = {'link':str(response.url), 'title':u'', 'passage':u''}
     item['contents']['title'] = sel.xpath("//h1[@id='artibodyTitle']/text()").extract()[0]
     item['contents']['passage'] = ListCombiner(sel.xpath('//p/text()').extract())
     return item
Example #37
    def parse_item(self, response):
        selector = Selector(response)

        # Use regex to find valid emails
        emails = selector.re(self.EMAIL_REGEX)
        if (emails):
            item = CompanyItem({'link': response.url})
            item['emails'] = emails

            return item

        return
Example #38
 def parse_get_user(self,response):
     log.msg("parse_get_user: " + response.url, level=log.INFO)
     try:
         sel = Selector(response)
         users = sel.re('"id=(.*?)\\"')
         for user in users:
             url='http://www.weibo.com/'+user
             yield Request(url=url,cookies=self.login_cookie,callback=self.parse_user,meta={'url':url})
     except Exception, e:
         log.msg("Error for parse_get_user: " + response.url, level=log.ERROR)
         log.msg(str(e), level=log.ERROR)
Example #39
 def parse_page(self, response):
     sel = Selector(response)
     emails = sel.re('(\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,6})')
     emails = list(filter(lambda x: x != '*****@*****.**', emails))
     if bool(emails):
         for email in emails:
             if email + "\n" not in email_in_file and email not in current_session_emails:
                 file.write(email+'\n')
                 current_session_emails.append(email)
                 print 'Spider: ProductionHub. Email {0} added to file'.format(email)
             else:
                 print 'Spider: ProductionHub. Email {0} already in the file'.format(email)
Example #40
    def parseTmall(self, response):
        """ Tmall parser
		"""
        def _referer():
            referer = response.request.headers.get('Referer')
            if referer and referer.find('list.tmall.com') > -1:
                rto = 'http://list.tmall.com/search_product.htm?'
                resultC = re.compile('[\?&]cat=(\d+)').search(referer)
                if resultC: rto += 'cat=%s' % resultC.group(1)
                resultQ = re.compile('[\?&]q=([^&]+)').search(referer)
                if resultQ:
                    if resultC: rto += '&q=%s' % resultQ.group(1)
                    else: rto += 'q=%s' % resultQ.group(1)
                if not 'http://list.tmall.com/search_product.htm?' == rto:
                    return rto
            elif not referer and response.url.find('detail.tmall.com') > -1:
                return response.url
            return ''

        sel = Selector(response)
        item = ProductItem()

        item['source'] = 'tmall'
        item['name'] = self.get_product_name(sel)
        item['start_url'] = _referer()
        store = ''.join(
            sel.xpath('//input[@name="seller_nickname"]/@value').extract())
        item['tm_store'] = '[%s] %s' % (store[-3:],
                                        store) if len(store) > 3 else store

        try:
            # Grab the TShop string and normalize it towards JSON
            TShop_str = sel.re('TShop\.Setup\(((.|\n)+?)\);')[0]
            # Strip comments; currently only Tmall Supermarket pages have them, starting with a comma
            regex = re.compile(',\s*\/\/[^\n]*')
            TShop_str = re.sub(regex, ',', TShop_str)
            TShop = eval(
                TShop_str,
                type('Dummy', (dict, ), dict(__getitem__=lambda s, n: n))())
        except SyntaxError:
            return

        item['itemId'] = TShop.get('itemDO').get('itemId', '')
        item['url'] = response.url

        initApi_url = TShop.get('initApi')

        yield Request(initApi_url,
                      headers={'Referer': 'http://www.google.com.hk/'},
                      meta={'item': item},
                      dont_filter=True,
                      callback=self.parse_initapi)
Example #41
 def parse(self, response):
     sel = Selector(response)
     products = json.loads("{"+sel.re("\"products\":\[.*\]")[0]+"}")
     items = []
     for product in products["products"]:
         #print(product)
         item = SephoraItem()
         item['display_name'] = product["display_name"]
         item['list_price'] = product["derived_sku"]["list_price"]
         item['brand_name'] = product["brand_name"]
         item['rating'] = product["rating"]
         items.append(item)
     return items
Example #42
 def parse_news(self, response):
     sel = Selector(response)
     pattern = re.match(self.url_pattern, str(response.url))
     item = TencentItem()
     item['source'] = 'tencent' # pattern.group(1)
     item['date'] = pattern.group(2)
     item['newsId'] = pattern.group(3)
     item['cmtId'] = (sel.re(r"cmt_id = (.*);"))[0] # unicode string
     item['comments'] = {'link':str('http://coral.qq.com/')+item['cmtId']}
     item['contents'] = {'link':str(response.url), 'title':u'', 'passage':u''}
     item['contents']['title'] = sel.xpath('//h1/text()').extract()[0]
     item['contents']['passage'] = ListCombiner(sel.xpath('//p/text()').extract())
     return item
Example #43
    def parse(self, response):
        self.driver.get(response.url)
        self.driver.execute_script("window.scrollTo(0, 1600)")
        self.driver.find_element_by_xpath(
            "//div[@class = 'index-showMoreText']").click()

        mount_root = Selector(text=self.driver.page_source)
        print(mount_root.re(r'url\("([^\")]+)'))
        row_keys = mount_root.xpath(
            "//*[@class = 'index-rowKey']/text()").getall()
        row_values = mount_root.xpath(
            "//*[@class = 'index-rowValue']/text()").getall()
        specifications = dict(zip(row_keys, row_values))
Example #44
    def parse4(self, response):

        print("PARSE4 GO!!!")
        selector = Selector(response=response)

        li = selector.re(r'<ul class="listContent">(.*?)</ul>')
        '''Iterate over all the pages and check whether the li tags exist'''
        if not li:
            # Use XPath to get the detail-page URLs
            url_pattern = r'//div[@class="content"]/div[1]/ul[@class="listContent"]/li/a/@href'
            url_list = selector.xpath(url_pattern).extract()

            print("======Redis连接信息:Host:{}  Port:{}======".format(
                self.settings.get('REDIS_HOST'),
                self.settings.get('REDIS_PORT')))

            # Write the URLs into Redis
            print('PARSE1 开始写入')
            for u in url_list:
                print("准备写入{}".format(u))
                self.r_link.rpush("Lianjia:detail_url", u)
                print("{}写入成功!".format(u))

            print('=' * 30, '\n', "共计写入url:{}个".format(len(url_list)), '\n',
                  '=' * 30)
        elif li[0] != '':
            # Use XPath to get the detail-page URLs
            url_pattern = r'//div[@class="content"]/div[1]/ul[@class="listContent"]/li/a/@href'
            url_list = selector.xpath(url_pattern).extract()

            # Start the Redis connection
            r_link = redis.Redis(port=self.settings.get('REDIS_PORT'),
                                 host=self.settings.get('REDIS_HOST'),
                                 decode_responses=True,
                                 db=1)
            print("======Redis连接信息:Host:{}  Port:{}======".format(
                self.settings.get('REDIS_HOST'),
                self.settings.get('REDIS_PORT')))

            # Write the URLs into Redis
            print('PARSE4 开始写入')
            for u in url_list:
                print("准备写入{}".format(u))
                r_link.rpush("Lianjia:detail_url", u)
                print("{}写入成功!".format(u))

            print('=' * 30, '\n', "共计写入url:{}个".format(len(url_list)), '\n',
                  '=' * 30)
        elif li[0] == '':
            pass
Example #45
 def parse_itme(self, response):
     print('1')
     x = Selector(response)
     names = x.xpath('//ul/li/b/text()').extract()
     leibie = x.re('小说分类:([\u4e00-\u9fa5]+)')
     links = LinkExtractor(allow=('/down/\d+.html'))
     link_list = links.extract_links(response)
     for link in link_list:
         yield Request(url=link.url,
                       callback=self.parse_itme1,
                       meta={
                           'name': names,
                           'leibie': leibie
                       })
Example #46
	def parseTmall(self, response):
		""" Tmall parser
		"""

		def _referer():
			referer = response.request.headers.get('Referer')
			if referer and referer.find('list.tmall.com') > -1:
				rto = 'http://list.tmall.com/search_product.htm?'
				resultC = re.compile('[\?&]cat=(\d+)').search( referer )
				if resultC: rto += 'cat=%s' % resultC.group(1)
				resultQ = re.compile('[\?&]q=([^&]+)').search( referer )
				if resultQ: 
					if resultC: rto += '&q=%s' % resultQ.group(1)
					else: rto += 'q=%s' % resultQ.group(1)
				if not 'http://list.tmall.com/search_product.htm?' == rto:
					return rto
			elif not referer and response.url.find('detail.tmall.com') > -1:
				return response.url
			return ''

		sel = Selector(response)
		item = ProductItem()  

		item['source']  = 'tmall'       
		item['name']    = self.get_product_name( sel )  
		item['start_url'] = _referer()
		store = ''.join( sel.xpath('//input[@name="seller_nickname"]/@value').extract() )
		item['tm_store'] = '[%s] %s' % (store[-3:], store) if len(store) > 3 else store

		try:
			# Grab the TShop string and normalize it towards JSON
			TShop_str = sel.re('TShop\.Setup\(((.|\n)+?)\);')[0]
			# Strip comments; currently only Tmall Supermarket pages have them, starting with a comma
			regex = re.compile(',\s*\/\/[^\n]*')
			TShop_str = re.sub(regex, ',', TShop_str)
			TShop = eval( TShop_str, type('Dummy', (dict,), dict(__getitem__=lambda s,n:n))() )      
		except SyntaxError:
			return  

		item['itemId']  = TShop.get('itemDO').get('itemId', '')
		item['url']     = response.url

		initApi_url = TShop.get('initApi')

		yield Request(  initApi_url, 
		                headers={'Referer': 'http://www.google.com.hk/'}, 
		                meta={'item': item}, 
		                dont_filter=True,
		                callback=self.parse_initapi )
Example #47
    def parse_url(self, response):
        hxs = Selector(response)
        download_url = hxs.re('jjvod_url = \'(.+)\';')

        meta = response.request.meta
        imgs = meta.get('imgs')
        title = meta.get('title')
        url = meta.get('url')

        item = CCUrlItem()
        item['url'] = url
        item['download_url'] = download_url
        item['title'] = title
        item['image_urls'] = imgs
        yield item
Example #48
 def parse_news(self, response):
     sel = Selector(response)
     pattern = re.match(self.url_pattern, str(response.url))
     
     item = NeteaseItem()
     item['source'] = 'netease' # pattern.group(1)
     item['date'] = '20' + pattern.group(2) + pattern.group(3)
     item['newsId'] = pattern.group(4)
     item['cmtId'] = item['newsId']
     item['boardId'] = sel.re(r"boardId = \"(.*)\"")[0]
     item['comments'] = {'link':str('http://comment.news.163.com/'+item['boardId']+'/'+item['cmtId']+'.html')}
     item['contents'] = {'link':str(response.url), 'title':u'', 'passage':u''}
     item['contents']['title'] = sel.xpath("//h1[@id='h1title']/text()").extract()[0]
     item['contents']['passage'] = ListCombiner(sel.xpath('//p/text()').extract())
     return item
Example #49
 def parse(self, response):
     selector = Selector(response)
      p1 = re.compile('<ul class="clearfix" style="display:none;">(.*?)</ul>', re.S)  # this regex finds the seven matching <ul></ul> blocks
     rp1 = selector.re(p1)
     p2 = re.compile(u'<a data-id="\d*" data-key="\D*" href="(http://.*)">(.*)</a>')
     r = Redis()
     for index, i in enumerate(rp1):
         rp2 = p2.findall(i)
         for j in rp2:
             con_item = ContryItem()
             con_item['name'] = j[1]
             con_item['url'] = j[0]
             con_item['con_id'] = index+1 #continent ID fk
             Redis_utils.server.lpush('myspider:qyconpage_urls', j[0])
             yield con_item
Example #50
    def parse(self, response):
        selector = Selector(response=response)
        results = selector.re(self.WIF_REGEX)
        #self.root_logger.debug("REGEX: " + str(results))
        for wif in results:
            if self.balance_checker.is_valid(wif):
                info = self.balance_checker.get_key_info(wif)
                if info.balance > 0:
                    self.root_logger.critical("CASH: " + str(info))
                    self.root_logger.critical("URL: " + str(response.url))
                    self.root_logger.critical("BODY: " + str(response.body))

        links = LinkExtractor(canonicalize=True,
                              unique=True).extract_links(response)
        for link in links:
            yield scrapy.Request(link.url, callback=self.parse)
Example #51
    def parse_stop_data(self, response):
        """Parse stop data from a response where the "from" and "to"
        stops were selected in the request."""
        sel = Selector(response)

        stop_name = sel.css('#confirm1_hlFrom').xpath('text()').extract()[0]
        stop_location = sel.css('#divFrom').xpath('p[1]/text()').extract()[0]

        latlong = sel.re(r'Location\(([\d\.\-,]+)\)')[0]
        (stop_lat, stop_long) = latlong.split(',')

        return StopItem(
            stop_name=stop_name,
            stop_location=self.sanitize_stop_location(stop_location),
            lat=stop_lat,
            long=stop_long
        )
Example #52
 def parse(self, response):
     sel = Selector(response)
     data = sel.re(r'\(([^)]+)\)')[0]
     try:
         data = json.loads(data)
     except Exception as e:
         print e
         self.log(e)
         print data
         self.log(data)
         print response.body
     url = urlparse.unquote(response.url)
     nums = re.search(r'"startPoint":{"latitude":(\d+.\d+),"longitude":(\d+.\d+)},"endPoint":{"latitude":(\d+.\d+),"longitude":(\d+.\d+)}',url).groups()
     slat, slong, elat, elong = [float(n) for n in nums]
     l = abs(slat - elat)
     w = abs(slong - elong)
     
     print 'EXPLORING:', (slat,slong), (elat,elong)
     self.log('EXPLORING: (%f, %f) (%f,%f)' %(slat,slong,elat,elong))
     
     quarters = [{'start':(slat, slong), 'wid':w/2, 'len':l/2}, 
                 {'start':(slat, slong + w/2), 'wid':w/2, 'len':l/2},
                 {'start':(slat - l/2, slong), 'wid':w/2, 'len':l/2},
                 {'start':(slat - l/2, slong + w/2), 'wid':w/2, 'len':l/2}]
     
     if not data:
         self.log('Failed: No data')
         return
         
     for point in data['pointDataList']:
         coord = point['coordinate']
         
         if point['count'] > 1:
             for i,a in enumerate(quarters):
                 if self.intersect(a, coord) and a['len'] >= self.minLen:
                     slat2, slong2 = a['start']
                     w2, l2 = a['wid'], a['len']
                     newUrl = self.getUrl((slat2,slong2), (slat2-l2, slong2+w2))
                     quarters.pop(i)
                     yield Request(newUrl)
                     break
                     
         else:
             celldata = point['cellData']
             yield Cell(ID=celldata['cellId'], networkID=celldata['mnc'], latitude=coord['latitude'], longitude=coord['longitude']) 
Example #53
 def parse(self, response):
     #log.msg( response.body, level=log.INFO )
     sel = Selector(response)
     #1. all items in current page
     urls = sel.re('<a href="(http://finance.sina.com.cn/stock/jsy/\\d+/\\d+.shtml)" target="_blank">')
     for url in urls:
         log.msg( url, level=log.INFO )
         yield Request( url, callback=self.parse_item)
     #2. next page detect
     #pageBar = sel.css('#Main > div.listBlk > table:nth-child(1) > tbody > tr > td > div > span.pagebox_next')
     #pageBar = sel.xpath( '//div[@id="Main"]/div[3]/table[1]/tbody/tr/td/div')
     #pageBar = response.xpath( '//div[@id="Main"]/div[3]/table[1]/tbody/tr/td/div')
     pageBar = response.xpath('//span[@class="pagebox_next"]')
     if pageBar != None and len(pageBar) > 0 :
         pageTxt = pageBar.extract()[0]
         log.msg( 'matched txt:'+pageTxt, level=log.INFO )
         tail_url = self.reg_next_page.search( pageTxt )
         log.msg('NEXT PAGE: '+tail_url.group(1), level=log.INFO )
         yield Request( 'http://roll.finance.sina.com.cn/finance/zq1/gsjsy/'+tail_url.group(1), callback=self.parse )
Example #54
    def parse_article(self, response):
        article = response.meta["article"]

        # We'll be using a regex for pub_time so we can't use selector
        # response shortcuts.
        sel = Selector(response=response)

        # TW adds a newline at the beginning of the title, so strip the string.
        article["title"] = sel.xpath(
                "//span[@class='titlea']/text()").extract_first().strip()

        # Publication date and time is grouped in a span together with the
        # number of times the article was accessed.  Therefore, use a regex to
        # extract just the date and time.  Matches eg.:
        # "04. July 2015. [17:16:00 UTC]"
        article["pub_time"] = sel.re(r"\d{2}\. \w+ \d{4}\. \[\d{2}:\d{2}:\d{2} UTC\]")[0]

        article["body"] = sel.xpath("//div[@class='articlebody']").extract_first()

        yield article
Example #55
    def parse_item(self, response):
        sel = Selector(response)
        items = sel.xpath(TABLE_TR_XPATH)

        for item in items:
            problem_id = item.xpath(
                PROBLEM_ID_XPATH).extract()[0].strip()
            submit_time = item.xpath(
                SUBMIT_TIME_XPATH).extract()[0].split(' ')[0]

            self.solved[problem_id] = submit_time

        if not sel.re(RULE_REGEX):
            yield AccountItem(**dict(
                origin_oj=self.origin_oj,
                username=self.username,
                solved=self.solved
            ))
            raise CloseSpider('Crawl finished')

        return