コード例 #1
0
ファイル: update.py プロジェクト: yantoumu/Beebeeto2
def get_onepage_poclist(page):
    """Collect the PoC link targets listed on one page of the beebeeto PoC index.

    Fetches ``http://beebeeto.com/pdb/?page=<page>`` through ``getHtml`` and
    scans the ``<div class="mainlist">`` element for anchors whose markup
    contains ``poc-``.

    Args:
        page: Page number to request; it is converted with ``str()``.

    Returns:
        A list of href values (one per matching anchor), or an empty string
        when the page could not be fetched, the highlighted pager entry reads
        ``1`` although another page was requested, or no list content is
        present.
    """
    info = getHtml("http://beebeeto.com/pdb" + '/?page=' + str(page))
    if '' == info:
        return ''

    bt = BeautifulSoup(info)

    # The pager anchor carrying this exact inline style is compared with '1'.
    # NOTE(review): presumably it marks the currently displayed page, so a
    # '1' for page != 1 means the request ran past the last page -- confirm.
    end = bt.find(
        'a', {
            'style':
            "font-size: 20px;font-weight: bold; border-bottom: 3px solid #777777;"
        })
    # find() returns None when the anchor is absent; only evaluate the pager
    # check when the anchor exists instead of raising AttributeError.
    if end is not None and '1' == end.renderContents() and page != 1:
        return ''

    mainlist = bt.find('div', {'class': 'mainlist'})
    if mainlist is None:
        # No list container on the page: same outcome as an empty container.
        return ""
    ret = mainlist.renderContents()
    if ret == "":
        return ""

    # Keep only anchors that mention "poc-" and reduce each opening tag to
    # its bare href value.
    return [
        anchor.replace('<a href="', "").replace('">', "").strip()
        for anchor in re.findall(r'<a href=.*?>', ret)
        if "poc-" in anchor
    ]
コード例 #2
0
def strip_html_and_tags(s, invalid_tags):
    """Return the text content of ``s`` with markup stripped.

    Elements whose tag name appears in ``invalid_tags`` are dropped together
    with everything inside them. Literal ``<br>``, ``<br/>`` and ``<br />``
    sequences are turned into newlines before parsing. Falsy input is
    returned unchanged.
    """
    if not s:
        return s

    from util.BeautifulSoup import BeautifulSoup

    # Convert line-break tags to newlines up front so they survive as text.
    markup = s
    for br_variant in ('<br>', '<br/>', '<br />'):
        markup = markup.replace(br_variant, '\n')
    soup = BeautifulSoup(markup)

    # Blank out every unwanted element, including its children.
    for tag_name in invalid_tags:
        for element in soup.findAll(name=tag_name):
            element.replaceWith("")

    # Only the unicode (text) nodes of the remaining tree are kept.
    text_nodes = [node for node in soup.recursiveChildGenerator()
                  if isinstance(node, unicode)]
    return ''.join(text_nodes)
コード例 #3
0
ファイル: update.py プロジェクト: yantoumu/Beebeeto2
def getPoc(poc):
    """Fetch a PoC page from beebeeto and return its Python source text.

    Requests ``http://beebeeto.com/pdb/<poc>/`` through ``getHtml`` and
    extracts the ``<pre class="brush: python;">`` element, passing its
    rendered contents through ``strip_tags``.

    Args:
        poc: Identifier of the PoC; inserted verbatim into the URL.

    Returns:
        The stripped source text, or an empty string when the page is empty,
        contains one of the rejection markers, lacks the ``<pre>`` element,
        or cannot be parsed.
    """
    info = getHtml("http://beebeeto.com/pdb/" + poc + "/")
    if '' == info:
        return ''
    # Pages containing either marker are treated as having no usable PoC.
    # NOTE(review): presumably these are placeholder / restricted pages
    # ("立即查看" translates to "view now") -- confirm against the site.
    if '<img src="/static/img/test.jpg"' in info:
        return ''
    if "立即查看" in info:
        return ''

    try:
        bt = BeautifulSoup(info)
        pre = bt.find('pre', {'class': "brush: python;"})
        if pre is None:
            # No code element on the page; handled explicitly rather than by
            # relying on an AttributeError being swallowed below.
            return ''
        ret = pre.renderContents()
        return strip_tags(ret) if ret else ''
    except Exception:
        # Best effort: any parsing/stripping failure yields "no PoC", while
        # KeyboardInterrupt and SystemExit are no longer swallowed.
        return ''
コード例 #4
0
def strip_html2(s):
    '''
    Reduce HTML to its text using the BeautifulSoup library; every <br>
    element becomes a newline. Falsy input is returned unchanged.

    >>> strip_html2('<html><body><b>Some <i>ugly</i></b> html.</body></html>')
    u'Some ugly html.'
    '''
    if not s:
        return s

    from util.BeautifulSoup import BeautifulSoup

    def _as_text(node):
        # Text nodes pass through, <br> maps to a newline, any other
        # element contributes nothing itself.
        if isinstance(node, unicode):
            return node
        if node.name == 'br':
            return '\n'
        return None

    converted = (_as_text(node)
                 for node in BeautifulSoup(s).recursiveChildGenerator())
    return ''.join(piece for piece in converted if piece is not None)
コード例 #5
0
ファイル: directim.py プロジェクト: sgricci/digsby
    def odc_body_received(self, data):
        """Process a received message body and hand it to the conversation.

        When ``data`` contains a ``<BINARY>`` section, the HTML in front of
        it is parsed; for every ``<img>`` tag the matching payload is looked
        up in the binary section, written to a file under
        ``stdpaths.userdata``, and the tag's ``src`` is pointed at that file
        (``width`` and ``height`` are removed). The resulting HTML, or the
        raw ``data`` when there is no binary section, is passed to
        ``self.convo.incoming_message``. Finally the socket is told to wait
        for the next ``ODCHeader``.

        Args:
            data: The raw message body.
        """
        info('odc_body_received')

        # Get a place to store the images.
        import stdpaths
        assetdir = stdpaths.userdata

        # Did the message include an inline image?
        binary_start = data.find('<BINARY>')
        if binary_start != -1:
            # Parse the HTML _before_ <BINARY>
            soup = BeautifulSoup(data[:binary_start])

            # Everything from <BINARY> onwards; identical for every image.
            imgdata = data[binary_start:]

            for img in soup.html.body('img'):  # may have more than one <img>
                # Locate the header that introduces this image's payload.
                findme = ' ID="%s" SIZE="%s">' % (str(
                    img['id']), str(img['datasize']))
                i = imgdata.find(findme)
                if i == -1:
                    # No payload for this tag: leave it untouched instead of
                    # writing an arbitrary slice of the stream to disk.
                    continue

                # The payload is exactly `datasize` bytes long and starts
                # right after the header. The previous end index
                # (datasize + 33) was only correct when the header ended at
                # offset 33, i.e. for the first image with a one-digit ID
                # and a five-digit size.
                start = i + len(findme)
                imgbytes = imgdata[start:start + int(img['datasize'])]

                # os.path.split the img src, because some clients send their
                # full paths. (file:///c:/blah.jpg)
                # NOTE(review): src comes from the remote peer; only its last
                # path component is used as the file name.
                imgpath = os.path.join(assetdir, os.path.split(img['src'])[1])

                img['src'] = imgpath
                del img['width']
                del img['height']

                with open(imgpath, 'wb') as f:
                    f.write(imgbytes)

            msg = unicode(soup.html)
        else:
            msg = data

        self.convo.incoming_message(self.screenname, msg)
        self.socket.receive_next(ODCHeader, self.odc_header_received)