def get_raw_info(html):
    if not isinstance(html, unicode):
        return '', '', ''
    title = ''.join(re.findall(RE_TITLE,
                               html)).strip()  # + re.findall(RE_H, html)

    #cc filt title from RE_TITLE
    title = tf.filt(title)

    html = re.sub(ur"(?is)</a><a", '</a> <a', html)
    h = re.findall(RE_H, html)
    for ht in h:
        ht = tf.remove(ht)
        ht = tf.filt(ht)
        ht = ht.strip()
        if ht == '': continue
        if title.lower().startswith(ht.lower()) and len(ht.split(' ')) > 2:
            title = ht
            break
        #cc compare string size, choose longer
        if len(title.split(' ')) < len(ht.split(' ')):
            title = ht
            break

    for k, v in RE_IGNORE_BLOCK.iteritems():
        html = re.sub(v, '', html)
    for k, v in RE_NEWLINE_BLOCK.iteritems():
        html = re.sub(v, '\n', html)
    html = re.sub(RE_MULTI_NEWLINE, '\n', html)

    return html_util.unescape(title), html_util.unescape(html)
Beispiel #2
0
def get_main_content(html, bodyHtml, webInfo):
    if not isinstance(html, unicode): return '', '', '', {}
    title = ''.join(re.findall(RE_TITLE,
                               html)).strip()  # + re.findall(RE_H, html)
    html = re.sub(ur"(?is)</a><a", '</a> <a', html)
    h = re.findall(RE_H, html)
    for ht in h:
        ht = ht.strip()
        if ht == '': continue
        if title.startswith(ht):
            title = ht
            break
    title = html_util.unescape(title)
    text = re.sub(RE_TAG, '', html)
    # 抽取发表时间
    time = ''
    t_time = re.findall(RE_TIME, text)
    if len(t_time) > 0:
        time = t_time[0]
    date = ''
    t_date = re.findall(RE_DATETIME, text)
    if len(t_date) > 0:
        date = t_date[0][0]
    images, text = HtmlTagStrip(bodyHtml, webInfo)
    return title, strtotime(date, time), text, images
Beispiel #3
0
def get_raw_info(html):
    if not isinstance(html, unicode):
        return '','',''
    title = ''.join(re.findall(RE_TITLE, html))# + re.findall(RE_H, html)
    html = re.sub(ur"(?is)</a><a",'</a> <a',html)
    h = re.findall(RE_H, html)
    for ht in h:
        ht = ht.strip()
        if ht == '': continue
        if title.startswith(ht):
            title = ht
            break
    for k,v in RE_IGNORE_BLOCK.iteritems():
        html = re.sub(v, '', html)
    for k,v in RE_NEWLINE_BLOCK.iteritems():
        html = re.sub(v, '\n', html)
    html = re.sub(RE_MULTI_NEWLINE, '\n', html)
    
    return html_util.unescape(title.strip()), html_util.unescape(html)
Beispiel #4
0
def get_raw_info(html):
    if not isinstance(html, unicode):
        return '', '', ''
    title = ''.join(re.findall(RE_TITLE, html))  # + re.findall(RE_H, html)
    html = re.sub(ur"(?is)</a><a", '</a> <a', html)
    h = re.findall(RE_H, html)
    for ht in h:
        ht = ht.strip()
        if ht == '': continue
        if title.startswith(ht):
            title = ht
            break
    for k, v in RE_IGNORE_BLOCK.iteritems():
        html = re.sub(v, '', html)
    for k, v in RE_NEWLINE_BLOCK.iteritems():
        html = re.sub(v, '\n', html)
    html = re.sub(RE_MULTI_NEWLINE, '\n', html)

    return html_util.unescape(title), html_util.unescape(html)
def get_datetime(html, title):
    """Find the first time and date that follow the title in the page.

    :param html: page markup.
    :param title: title text to look for in the markup (after RE_HEAD
        matches have been removed).
    :return: the result of ``strtotime(date, time)``; either argument is
        ``''`` when the corresponding pattern has no match.
    """
    # Keep only the content that comes after the title.
    body = re.sub(RE_HEAD, '', html)
    # NOTE(review): the offset is located in the unescaped text but applied
    # to the raw text; the two may not line up when entities precede the
    # title -- confirm whether that matters for callers.
    offset = html_util.unescape(body).find(title)
    if offset >= 0:
        body = body[offset:]

    # Extract the time starting from the title; only the first '.' is
    # turned into ':'.
    time_hits = re.findall(RE_TIME, body)
    clock = (time_hits[0] if time_hits else '').replace('.', ':', 1)

    date_hits = re.findall(RE_DATETIME, body)
    day = date_hits[0][0] if date_hits else ''

    return strtotime(day, clock)
Beispiel #6
0
def supplesubtitleimages(url, html, text, title):
    if text == '': return text
    imagetext = ''

    try:
        bodytext = re.sub(RE_HEAD, '', html)

        #subtitle
        titlepos = (html_util.unescape(bodytext)).find(title)
        if titlepos >= 0:
            bodytext = bodytext[titlepos:]
        #print "bodytext:%s"%bodytext[:1000]

        #upcontent
        contentpos = -1
        text1 = re.sub(RE_TAG, '', text)
        if len(text1) > 30:
            lastcontent = text1[len(text1) - 30:]
            contentpos = (html_util.unescape(bodytext)).find(lastcontent)
        if contentpos > 0:
            bodytext = bodytext[:contentpos]

        #domain
        domain = ''
        try:
            domain = get_tld(url)
            #print "domain:%s"%domain
        except Exception, e:
            print e
            pass

        #words
        keywords = ['.jpg', '.gif', '.jpeg']
        filtwords = ['thumb', 'twitter', 'facebook']

        images = re.findall(RE_IMG_SRC, bodytext)
        for image in images:
            #domain
            if image.find(domain) < 0 and image.lower().find('news') < 0 \
                    and image.find('intoday') < 0 and image.startswith('/') == False:
                continue

            #one keyword at least
            bfind = False
            for keyword in keywords:
                if image.lower().find(keyword) != -1:
                    bfind = True
                    break
            if bfind == False:
                continue

            # no filtword
            bfind = False
            for filtword in filtwords:
                if image.lower().find(filtword) != -1:
                    bfind = True
                    break
            if bfind:
                continue

            if image.lower().startswith('/'):
                if domain.find('http://') == -1:
                    image = 'http://' + domain + image
                else:
                    image = domain + image

            imagetext += '<img from=\"subtitle\" src=\"' + image + '\" />\n'
            break