Code example #1
File: dupeFilter.py Project: hw20686832/iCrawler
    def request_seen(self, request):
        # Only requests dispatched through our spiders carry the extra metadata.
        if 'spider' in request.meta:
            spider = request.meta['spider']
            mission = request.meta.get('mission', ('', ''))
            seen = '#'.join((mission[1], request.url))

            # zscore() returns None for unknown members; a hit means the URL was
            # already crawled, so refresh its timestamp and drop the request.
            if cs.zscore('crawled_url_%s' % spider, seen) is not None:
                cs.zadd('crawled_url_%s' % spider, seen, time.time())
                return True
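
The `cs` handle is not shown in this excerpt; it is presumably a module-level Redis client, and the zadd call above passes the member and score positionally, which matches older redis-py releases. A minimal standalone sketch of the same seen-set bookkeeping, assuming redis-py 3.x and a local Redis instance (the spider name and URL below are made up for illustration):

import time

import redis

# Hypothetical stand-in for the module-level client the filter relies on.
cs = redis.StrictRedis(host='localhost', port=6379, db=0)

spider = 'demo'                                      # made-up spider name
seen = '#'.join(('m01', 'http://example.com/a/1'))   # made-up mission/url pair

if cs.zscore('crawled_url_%s' % spider, seen) is None:
    # redis-py 3.x expects a {member: score} mapping rather than positional args.
    cs.zadd('crawled_url_%s' % spider, {seen: time.time()})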
Code example #2
File: pipelines.py Project: hw20686832/iCrawler
    def item_completed(self, results, item, info):

        spider = info.spider
        seen = '#'.join((item['father_url_number'], item['child_url']))
        # Drop items whose mission/url pair is already recorded in Redis.
        if cs.zscore('crawled_url_%s' % spider.name, seen) is not None:
            raise DropItem("Duplicate item found: %s" % item)

        content = item['content']
        content_html = PyQuery(content)

        content_link = content_html('a')
        
        content_a = []

        if 'xw' in item['sum_mark'] or 'jr' in item['sum_mark']:
            # Strip <a ...> and </a> tags but keep the link text.
            content_html = content_html.html()
            content_html = re.sub(u'<a.*?>', '', content_html)
            content_html = re.sub(u'</a>', '', content_html)
            
            
            # Drop color-related inline styling and color attributes.
            content_html = PyQuery(content_html)

            content_style = content_html('[style]')
            for style_d in content_style:
                el = content_style.eq(content_style.index(style_d))
                if 'color' in el.attr('style') or 'COLOR' in el.attr('style'):
                    el.removeAttr('style')

            content_color = content_html('[color]')
            for color_d in content_color:
                content_color.eq(content_color.index(color_d)).removeAttr('color')
            
        # Drop fixed width/height and inline styles from non-GIF images.
        img_d = content_html('img')
        for img in img_d:
            if not img.get('src') or '.gif' in img.get('src'):
                continue
            el = img_d.eq(img_d.index(img))
            for attr in ('width', 'height', 'style'):
                if el.attr(attr):
                    el.removeAttr(attr)
        
        # Drop fixed table widths.
        table_d = content_html('table')
        for tab in table_d:
            el = table_d.eq(table_d.index(tab))
            if el.attr('width'):
                el.removeAttr('width')

        # Drop alignment on <div> and inline styles on <p>.
        div_d = content_html('div')
        for div in div_d:
            el = div_d.eq(div_d.index(div))
            if el.attr('align'):
                el.removeAttr('align')

        p_d = content_html('p')
        for p in p_d:
            el = p_d.eq(p_d.index(p))
            if el.attr('style'):
                el.removeAttr('style')

        # Drop rel attributes from <link> elements.
        link_d = content_html('link')
        for rel in link_d:
            el = link_d.eq(link_d.index(rel))
            if el.attr('rel'):
                el.removeAttr('rel')
                    
        # Strip <strong>, <font>, <pre> and <center> tags (keeping their text)
        # with plain regex substitutions on the serialized HTML.
        content_html = content_html.html()
        for pattern in (u'<strong.*?>', u'</strong>',
                        u'<font.*?>', u'</font>',
                        u'<pre.*?>', u'</pre>', u'<pre/>',
                        u'<center.*?>', u'</center>', u'<center/>'):
            content_html = re.sub(pattern, '', content_html)

        # Drop the 1px-height inline style (image background hack).
        content_html = re.sub(u'style="height:1px"', '', content_html)
        content_html = PyQuery(content_html)

        # Center every non-GIF image; outside 'wb' content, also wrap it in a
        # "woshiNhao" placeholder link that is later replaced by the full-size path.
        imgn = 1
        img_d2 = content_html('img')
        for img in img_d2:
            src = img.get('src')
            if not src or '.gif' in src or '.GIF' in src:
                continue
            el = img_d2.eq(img_d2.index(img))
            if 'wb' not in item['sum_mark']:
                el.wrap('<a href="woshi' + str(imgn) + 'hao"><p align="center"></p></a>')
            else:
                el.wrap('<p align="center"></p>')
            imgn = imgn + 1

                    
        # Rewrite relative image src values to the absolute URLs collected in
        # item['image_urls']; n counts skipped GIFs so the indices stay aligned.
        n = 0
        img_d3 = content_html('img')
        for img in img_d3:
            src = img.get('src')
            if not src:
                continue
            if '.gif' in src or '.GIF' in src:
                n = n + 1
                continue
            if 'http' not in src:
                content = content_html.html()
                content = content.replace(src, item['image_urls'][img_d3.index(img) - n])
                content_html = PyQuery(content)
        
        content = content_html.html()
        image_paths = [x['path'] for ok, x in results if ok]

        image_interval = settings.get('IMAGES_INTERVAL')

        def sub_image(thumb, full):
            # Map a stored image path ("full/<name>.jpg") to its public URL under
            # the requested thumbnail directory, appending IMAGES_INTERVAL as a
            # query parameter.
            pattern = re.compile(r'full/(.*?)\.jpg')
            return pattern.sub('/images/%s/\\1.jpg?d=%s' % (thumb, image_interval), full)
        
        # Swap each "woshiNhao" placeholder for its full-size path and each
        # original image URL for its big-thumbnail path.
        img_number = 1
        for ok, x in results:
            if not ok:
                continue
            path = sub_image('thumbs/big', x['path'])
            full_path = sub_image('full', x['path'])
            content = content.replace('&amp;', '&')

            img_yuan = "woshi" + str(img_number) + "hao"
            content = content.replace(img_yuan, full_path)
            content = content.replace(x['url'], path)
            img_number = img_number + 1
        
        if not item['pic_url']:
            item['pic_url'] = sub_image('thumbs/small', image_paths[0]) if image_paths else None
            
        content_text = PyQuery(content)
        if not content_text.text():
            raise DropItem("Empty content: %s" % item['child_url'])
        item['content'] = content
        item['climb_time'] = time.time()

        # Queue the serialized item and record its URL as crawled.
        jitem = json.dumps(item.withdict())
        cs.rpush('data_%s' % spider.name, jitem)
        cs.zadd('crawled_url_%s' % spider.name, seen, time.time())

        return item
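
The attribute-stripping passes above all repeat the same eq()/index() lookup; the fragment below is a standalone sketch of that pattern against made-up markup, assuming only that pyquery is installed (the HTML and attribute values are illustrative, not taken from the project):

from pyquery import PyQuery

# Illustrative markup; the real pipeline works on item['content'].
doc = PyQuery('<div><img src="a.jpg" width="600" height="400" style="border:0">'
              '<img src="spacer.gif" width="1"></div>')

imgs = doc('img')
for img in imgs:
    # Skip images without a src and GIFs, mirroring the pipeline's filter.
    if not img.get('src') or '.gif' in img.get('src'):
        continue
    el = imgs.eq(imgs.index(img))
    for attr in ('width', 'height', 'style'):
        if el.attr(attr):
            el.removeAttr(attr)

print(doc)  # the non-GIF image is emitted without width/height/style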