def PageListParser(url):
    """Fetch the listing page at *url* and return its digested parse result.

    The page is requested through ``httplib21``.  A response flagged as
    served from the cache short-circuits to ``nothing.Nothing()``.
    Otherwise the body must have the MIME main type ``text``: it is fed to
    ``BS`` with the charset announced by the response (if any), run through
    ``parse``, and the second element of that result is handed to
    ``digest`` together with the URL reported by ``geturl()``.

    Raises:
        NotImplementedError: if the response's main type is not ``text``.
    """
    response, body = httplib21.request(url)

    # Cached responses are not parsed again.
    if response.fromcache:
        return nothing.Nothing()

    main_type = response.info().getmaintype()
    if main_type != 'text':
        message = 'we expect a text format, not the %s' % response.info().gettype()
        raise NotImplementedError(message)

    # None when the response does not declare a charset.
    encoding = response.info().getparam('charset')
    soup = BS(body, from_encoding=encoding)

    parsed = parse(soup)
    return digest(parsed[1], response.geturl())
def PageContentParser(url):
        """Fetch *url* and return a content object chosen by its MIME type.

        * main type ``text``             -> ``ArticleExtractor(resp, cont)``
        * ``application/vnd.ms-excel``   -> ``excelParser(cont)``
        * ``application/msword``         -> an ad-hoc ``MsWord`` instance that
          only offers ``getMainContent()`` and ``getTitlePrefix()`` with
          fixed placeholder values

        Raises:
            TypeError: for any other content type.
        """
        response, body = httplib21.request(url)

        if response.info().getmaintype() == 'text':
            return ArticleExtractor(response, body)

        if response.info().gettype() == 'application/vnd.ms-excel':
            return excelParser(body)

        if response.info().gettype() == 'application/msword':
            # Word documents are not parsed; hand back a stub object.  The
            # placeholder text is Chinese, roughly: "Wait a bit longer, maybe
            # in my next life I will understand the Word format."
            stub_methods = {
                'getMainContent': lambda self : u'再等等,或许下辈子我会看懂<a target="_blank" href="http://download.microsoft.com/download/0/B/E/0BE8BDD7-E5E8-422A-ABFD-4342ED7AD886/Word97-2007BinaryFileFormat%28doc%29Specification.pdf">word的格式</a>。',
                'getTitlePrefix': lambda self : u'[DOC]',
            }
            return type('MsWord', (), stub_methods)()

        raise TypeError('I have no idea how the %s is formatted' % response.info().gettype())
    def calcArea(self):
        """Estimate the pixel area of the image node held in ``self.bs_node``.

        The ``height`` and ``width`` attributes of the node are used when
        they parse as integers (a trailing ``px`` is tolerated).  If either
        one is missing or unparsable, the image referenced by ``src`` is
        downloaded and its intrinsic size is used to fill in the missing
        dimension(s), keeping the image's aspect ratio.  If the download or
        decoding fails, the image is assumed to be square.

        Returns:
            ``width * height``, or 0 when the image is judged too small to
            matter (a side <= ``minpix``), when a dimension is given as a
            percentage, or when the fetched image has a zero dimension.
        """
        minpix = 55  # mind the side-bar pics

        # NOTE(review): Nothing() is defined elsewhere; presumably a null
        # object whose string methods are harmless no-ops -- confirm.
        height = self.bs_node.get('height', Nothing()).strip().rstrip('px')
        width = self.bs_node.get('width', Nothing()).strip().rstrip('px')

        # if you use percentage in height or width,
        # in most cases it cannot be the main-content
        if height.endswith('%') or width.endswith('%'):
            return 0

        # Anything that does not parse as an integer counts as "unknown" (0).
        try:
            height = int(height)
        except Exception:
            height = 0
        try:
            width = int(width)
        except Exception:
            width = 0

        # An explicitly declared small side is enough to reject the image
        # without downloading it.
        if 0 < height <= minpix or 0 < width <= minpix:
            return 0

        if not (height and width):
            # At least one dimension is unknown: look at the real image.
            fp = cStringIO.StringIO()
            try:
                _resp, content = httplib21.request(self.bs_node['src'])
                fp.write(content)
                fp.seek(0)
                # presumably PIL's Image, whose .size is (width, height)
                w, h = Image.open(fp).size
            except Exception:
                # Best effort: without the real size, assume a square image
                # so a known side is simply mirrored onto the unknown one.
                h = w = 1.0
            finally:
                fp.close()

            # A degenerate image covers no area (and would otherwise cause
            # a division by zero below).
            if not (w and h):
                return 0

            if not (height or width):
                height, width = h, w  # no need to convert
            elif not height:
                # height / width == h / w
                height = int(width * h / float(w))
            else:
                # width / height == w / h
                width = int(height * w / float(h))

        if height <= minpix or width <= minpix:
            return 0
        return width * height