# Fetch a listing page and return the digested result of parsing it.
#
# Requests `url` through httplib21.request. When the response reports
# `fromcache`, a nothing.Nothing() instance is returned instead of parsing.
# Otherwise the body is handed to BS and the function returns
# digest(parse(docRoot)[1], resp.geturl()).
#
# Raises NotImplementedError when the (missing, see below) content-type test
# does not compare equal to 'text'.
#
# NOTE(review): this copy of the function is incomplete and does not parse:
# the `if` has no left operand, the `%` format argument is missing, and the
# `charset` assignment has lost whatever call produced the value. Restore
# the missing expressions from version control before changing the logic.
def PageListParser(url):
    resp, cont = httplib21.request(url)
    # A response served from the cache short-circuits to a Nothing() instance.
    if resp.fromcache: return nothing.Nothing()
    # NOTE(review): left operand missing; presumably the response's major
    # content type is compared with 'text' here -- confirm against history.
    # The format argument of the message and the lookup that yields
    # `charset` (apparently keyed by 'charset') are missing as well.
    if != 'text':
        raise NotImplementedError('we expect a text format, not the %s' %
    charset ='charset') # or None
    # The detected charset is passed to BS as `from_encoding`.
    docRoot = BS(cont, from_encoding=charset)
    # Only element [1] of parse()'s result is digested, together with the
    # URL reported by the response via geturl().
    return digest(parse(docRoot)[1], resp.geturl())
# Fetch `url` and return a content object chosen by the response type.
#
# As far as the visible branches show:
#   * 'text'                -> ArticleExtractor(resp, cont)
#   * 'application/'        -> excelParser(cont)
#   * 'application/msword'  -> an instance of an ad-hoc 'MsWord' class that
#                              exposes getMainContent() and getTitlePrefix()
#   * anything else         -> TypeError (see the note on the last line)
#
# NOTE(review): this copy of the function is incomplete and does not parse:
# every condition has lost its left operand, and the final `raise` has lost
# its format argument and closing parenthesis. Restore them from version
# control before changing the logic.
def PageContentParser(url):
        resp, cont = httplib21.request(url)
        # NOTE(review): left operand missing in each comparison below;
        # presumably the response's content type -- confirm.
        if == 'text':
            return ArticleExtractor(resp, cont)
        # NOTE(review): the literal 'application/' looks truncated (the
        # branch calls excelParser, so a spreadsheet subtype was presumably
        # intended) -- confirm.
        elif == 'application/':
            return excelParser(cont)
        elif == 'application/msword':
            # Word documents are not parsed: a stub object is built on the
            # fly. Its getMainContent returns a fixed HTML snippet (roughly:
            # "wait a bit longer, maybe in my next life I will understand
            # the word format") and its getTitlePrefix returns u'[DOC]'.
            getMainContent = lambda self : u'再等等,或许下辈子我会看懂<a target="_blank" href="">word的格式</a>。'
            getTitlePrefix = lambda self : u'[DOC]'
            return type('MsWord', (), {'getMainContent':getMainContent, 'getTitlePrefix':getTitlePrefix})()
            # NOTE(review): as written this `raise` follows a `return` inside
            # the msword branch and is unreachable; an `else:` line appears
            # to be missing above it. The format argument is missing too.
            raise TypeError('I have no idea how the %s is formatted' %
    # Return the pixel area (width*height) of the image node `self.bs_node`,
    # or 0 when the image is judged too small or is sized in percent.
    #
    # The size is taken from the node's 'height'/'width' attributes. When
    # one or both are absent (or not integers), the image referenced by
    # 'src' is requested so the missing dimension(s) can be derived.
    #
    # NOTE(review): the fetch section below is incomplete in this copy
    # (orphaned indentation, `w, h =` with no right-hand side) and does not
    # parse. Restore it from version control before changing the logic.
    def calcArea(self):
        minpix = 55 # mind the side-bar pics
        # Missing attributes fall back to a Nothing() instance; presumably
        # it absorbs the chained .strip()/.rstrip() calls -- confirm.
        # NOTE(review): if the value is a str, rstrip('px') strips any
        # trailing 'p'/'x' characters, not specifically the suffix "px".
        height = self.bs_node.get('height', Nothing()).strip().rstrip('px')
        width = self.bs_node.get('width', Nothing()).strip().rstrip('px')

        # if you use percentage in height or width,
        # in most cases it cannot be the main-content
        if height.endswith('%') or width.endswith('%'):
            return 0
        # Anything that is not an integer literal is treated as "unknown" (0).
        # NOTE(review): bare `except` also hides unrelated errors.
        try: height = int(height)
        except: height = 0
        try: width = int(width)
        except: width = 0

        # A known dimension at or below minpix disqualifies the image outright.
        if 0<height<=minpix or 0<width<=minpix:
            return 0

        # At least one dimension is unknown: look at the image itself.
        if not (height and width):
            fp = cStringIO.StringIO()
                # NOTE(review): the extra indentation suggests a `try:` line
                # (and the matching `except:` before `h = w = 1.0`) was lost;
                # the right-hand side of `w, h =` is missing as well, so it
                # is not visible how `fp` and the response body are used.
                r, c = httplib21.request(self.bs_node['src'])
                w, h =
                h = w = 1.0
                # Height-to-width ratio of the fetched image.
                hdw = h/float(w) # we need float here
                if not (height or width):
                    # Neither attribute was given: use the fetched size as is.
                    height, width = h, w # no need to convert
                elif not height:
                    # Only width was given: scale it by the ratio.
                    height = int(hdw*width)
                    # NOTE(review): an `else:` seems to be missing before the
                    # next line (it reads like the "only height given" case).
                    # Also, hdw is h/w, so deriving width from height would
                    # normally divide by hdw rather than multiply -- confirm.
                    width = int(hdw*height)

        # Final size check after any dimension was filled in.
        if height<=minpix or width<=minpix:
            return 0
        return width*height