Example #1
0
def ssdut_news_parse(raw):
    ''' parse the raw page src,

    store all result in a Storage object.
    all strings are unicode

    result.soup
        BeautifulSoup object
    result.raw
        raw page src
    result.hash
        sha1 hash of the page
    result.title
        title
    result.source
        来源
    result.date_str - date in string
    result.date - date object
    result.body
        html src of the news body
    result.clean_body
        unescaped src of the news body,
    result.publisher
        发表人
    '''
    soup = bsoup(raw)
    result = Storage()

    # raw page / hash
    result.raw = raw
    result.soup = soup

    # title
    s = soup.find(attrs={'class': re_compile('title')})
    result.title = s.text

    # source
    text = soup.find(text=re_compile(r"^http://ssdut.dlut.edu.cn"))
    r = re_compile(ur"(\d+-\d+-\d+)\u3000\u3000\u6765\u6e90:(.+)\u5173\u6ce8:")
    res = r.findall(text)[0]
    result.source = res[1].rstrip()

    # date
    result.date_str = res[0]
    result.date = datetime.date(*[int(n) for n in result.date_str.split('-')])

    # content (body)
    c = soup.find(attrs={'class': re_compile('content')})
    result.body = unicode(c)

    # content (body)  unescaped
    texts = c.findAll(text=True)
    all_texts = '\n'.join(texts)
    result.clean_body = html_parser.unescape(all_texts)

    # publisher (could be find at the bottom of page)
    s = soup.find(
        attrs={
            "style": "font-size:14px;float:left;text-align:right;width:80%"
        })
    r = re_compile(ur"\u53d1\u8868\u4eba\uff1a(.+)")
    #logging.debug("publisher string = %r " % s)

    try:
        name = r.findall(s.text)[0]
    except:
        logging.warn(" %s has no publisher " % result.title)
        name = ""  # no publisher: like this: index.php/News/8692.html
    result.publisher = name.rstrip().lstrip()

    # use utf-8 encoding
    for k in ['title', 'source', 'body', 'clean_body', 'publisher']:
        result[k] = result[k].encode('utf-8')

    hash_src = result.body + result.title + result.publisher
    if isinstance(hash_src, str):
        hash_src = unicode(hash_src, "utf-8", "ignore")
    elif isinstance(hash_src, unicode):
        pass
    else:
        pass
    result.sha1 = sha1(hash_src.encode("utf-8")).hexdigest()
    result.search_text = ''.join([
        result.title, result.source, result.clean_body, result.publisher,
        result.sha1
    ])
    return result
Example #2
0
def ssdut_news_parse(raw):
    ''' parse the raw page src,

    store all result in a Storage object.
    all strings are unicode

    result.soup
        BeautifulSoup object
    result.raw
        raw page src
    result.hash
        sha1 hash of the page
    result.title
        title
    result.source
        来源
    result.date_str - date in string
    result.date - date object
    result.body
        html src of the news body
    result.clean_body
        unescaped src of the news body,
    result.publisher
        发表人
    '''
    soup = bsoup(raw)
    result = Storage()

    # raw page / hash
    result.raw = raw
    result.soup = soup

    # title
    s = soup.find(attrs={'class': re_compile('title')})
    result.title = s.text

    # source
    text = soup.find(text=re_compile(r"^http://ssdut.dlut.edu.cn"))
    r = re_compile(
        ur"(\d+-\d+-\d+)\u3000\u3000\u6765\u6e90:(.+)\u5173\u6ce8:")
    res = r.findall(text)[0]
    result.source = res[1].rstrip()

    # date
    result.date_str = res[0]
    result.date = datetime.date(*[int(n) for n in result.date_str.split('-')])

    # content (body)
    c = soup.find(attrs={'class': re_compile('content')})
    result.body = unicode(c)

    # content (body)  unescaped
    texts = c.findAll(text=True)
    all_texts = '\n'.join(texts)
    result.clean_body = html_parser.unescape(all_texts)

    # publisher (could be find at the bottom of page)
    s = soup.find(
        attrs={
            "style": "font-size:14px;float:left;text-align:right;width:80%"
        })
    r = re_compile(ur"\u53d1\u8868\u4eba\uff1a(.+)")
    #logging.debug("publisher string = %r " % s)

    try:
        name = r.findall(s.text)[0]
    except:
        logging.warn(" %s has no publisher " % result.title)
        name = ""  # no publisher: like this: index.php/News/8692.html
    result.publisher = name.rstrip().lstrip()

    # use utf-8 encoding
    for k in ['title', 'source', 'body', 'clean_body', 'publisher']:
        result[k] = result[k].encode('utf-8')


    hash_src = result.body + result.title + result.publisher
    if isinstance(hash_src, str):
        hash_src = unicode(hash_src, "utf-8", "ignore")
    elif isinstance(hash_src, unicode):
        pass
    else:
        pass
    result.sha1 = sha1(hash_src.encode("utf-8")).hexdigest()
    result.search_text = ''.join([result.title, result.source,
                                  result.clean_body, result.publisher,
                                  result.sha1])
    return result