Example #1
def net_test(config_path='config.xml'):
    print "net_test ==>  "

    html = get_html_from_url(
        "http://news.163.com/13/0727/16/94Q724990001124J.html")

    charset = get_charset_of_html(html)
    print "163 charset: ", charset
    doc = HTML.fromstring(html.decode(charset, 'ignore'))

    result = doc.xpath("//div[@class='left']")
    print "len ", len(result)

    # Keep the text of the last matching node, with line breaks stripped.
    for ret in result:
        value = ret.text_content().strip()
        value = re.sub(r'[\r\n]', '', value)
    print value

    # Publish time.
    regx = r'(\d{4}.\d{1,2}.\d{1,2}.*\d{1,2}:\d{1,2})'
    print "时间:", re.search(regx, value).group(1)

    # Source: the token between "来源" and the trailing "有..." suffix on 163 pages.
    regx = ur'来源[::\s]*([^\s]+)有.*'
    print "来源:", re.search(regx, value).group(1).strip()
Example #2
def ifeng_test(config_path='config.xml'):
    print "ifeng_test ==>  "

    html = get_html_from_url(
        "http://news.ifeng.com/society/1/detail_2013_07/27/27973995_0.shtml")
    print html[:800]

    charset = get_charset_of_html(html)
    print "ifeng charset: ", charset
    doc = HTML.fromstring(html.decode(charset, 'ignore'))

    result = doc.xpath("//div[@id='artical_sth']/p")
    print "len ", len(result)

    # Keep the text of the last matching node, with line breaks stripped.
    for ret in result:
        value = ret.text_content().strip()
        value = re.sub(r'[\r\n]', '', value)
    print value

    # Publish time.
    regx = r'(\d{4}.\d{1,2}.\d{1,2}.*\d{1,2}:\d{1,2})'
    print "时间:", re.search(regx, value).group(1)

    # Source: the first non-whitespace run after "来源:".
    regx = ur'来源[::]*([^\s]*)\s*'
    print "来源:", re.search(regx, value).group(1).strip()

    # Article body paragraphs (xpath always returns a list, so just iterate).
    xpath_cont = "//div[@id='main_content']/p"
    for cont in doc.xpath(xpath_cont):
        print cont.text_content().strip()
Example #3
def parse_config(config_path="config.xml"):
    print "parse_xml ==>  "

    html = get_html_from_url(
        "http://finance.sina.com.cn/china/20130727/015816259014.shtml")

    charset = get_charset_of_html(html)
    doc = HTML.fromstring(html.decode(charset, 'ignore'))

    result = doc.xpath('//div[@class="artInfo"]')
    print "len ", len(result)

    # Keep the text of the last matching node.
    for ret in result:
        value = ret.text_content().strip()
    print value

    # Publish time.
    regx = r'(\d{4}.\d{1,2}.\d{1,2}.*\d{1,2}:\d{1,2})'
    print "时间:", re.search(regx, value).group(1)

    # Source: the non-whitespace run glued right after the ":MM" of the time.
    regx = ur':\d{2}([^\s]*)\s*'
    print "来源:", re.search(regx, value).group(1).strip()
Example #4
def get_and_parse_herf_list(herf_list):
    print "get_and_parse_herf_list->"
    for herf in herf_list:
        html = get_html_from_url(herf)
        tree = etree.HTML(html.decode('gbk', 'ignore'))
        try:
            # Title
            title = tree.xpath(xpath_title)[0].text

            # Publish time: keep the last regex match.
            pub_time_raw = tree.xpath(xpath_pub_time)[0].text
            for match in re.finditer(regx_pub_time, pub_time_raw):
                pub_time = match.group()

            # Source
            source = tree.xpath(xpath_source)[0].text

            # Body content
            content = StringIO.StringIO("")
            for section in tree.xpath(xpath_content):
                content.write(section.text)
        except Exception:
            # Skip pages that do not match the configured xpaths; falling
            # through would print unbound (or stale) names below.
            continue

        print "herf: ", herf
        print "title: ", title
        print "pub_time: ", pub_time
        print "source: ", source
        print "content: ", content.getvalue()
Example #5
def test_parse_href_list_of_url(url, node):
    print "parse_herf_list_of_url ==> "

    # List of news links parsed out of the index page.
    href_list = []
    string = StringIO.StringIO('')

    # Check that the index-related node from config.xml exists.
    if node is None:
        print "[Error]: index tag of config.xml error"
        return href_list

    # Fetch the page content.
    html = get_html_from_url(url)
    try:
        xpath_list = node.findall('xpath')
        regx_page = node.find('regx_page').text
    except Exception:
        print 'xpath regx_page config of ', url, ' error'
        return href_list

    # Detect the page charset and decode.
    charset = get_charset_of_html(html)
    print "charset: ", charset
    html = html.decode(charset, 'ignore')

    print "regx_page", regx_page

    # Parse the top-news links and titles with each configured xpath.
    doc = HTML.fromstring(html)
    for xpath_item in xpath_list:
        xpath = xpath_item.text
        print "xpath: ", xpath
        news_items = doc.xpath(xpath)
        print "len: ", len(news_items)

        # Walk the anchors one by one, keeping only those whose href
        # matches the article-page URL pattern.
        for news_item in news_items:
            title = news_item.text
            href = news_item.get('href')

            print title, href
            if title is not None and href is not None \
                    and re.match(regx_page, href):
                string.write(title + "\t" + href + "\n")
                href_list.append(href)

    # The buffered "title\thref" lines would be written to a file here;
    # this test variant just discards the buffer.
    string.close()
    return href_list
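
The node argument is the index element from config.xml, carrying one or more xpath children plus a regx_page pattern for article URLs. A hypothetical fragment and driver; the site URL and patterns are invented for illustration:

# Hypothetical <index> config and driver; the real config.xml differs.
from lxml import etree

index_node = etree.XML(
    "<index>"
    "<xpath>//div[@class='top_news']//a</xpath>"
    "<xpath>//ul[@class='news_list']//a</xpath>"
    "<regx_page>http://news\\.163\\.com/\\d{2}/\\d{4}/\\d{2}/\\w+\\.html</regx_page>"
    "</index>")

href_list = test_parse_href_list_of_url("http://news.163.com/", index_node)
print "parsed ", len(href_list), " links"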
Example #6
def sohu_test(config_path='config.xml'):
    print "sohu_test ==>  "

    html = get_html_from_url(
        "http://business.sohu.com/20130727/n382723680.shtml")

    doc = HTML.fromstring(html)
    result = doc.xpath("//div[@class='time-source']")
    print "len ", len(result)

    # Keep the text of the last matching node.
    for ret in result:
        value = ret.text_content().strip()
    print value

    # Publish time.
    regx = r'(\d{4}.\d{1,2}.\d{1,2}.*\d{1,2}:\d{1,2})'
    print "时间:", re.search(regx, value).group(1)

    # Source: everything after "来源:", stopping before an optional
    # "作者" (author) field.
    regx = ur'来源[::](.*?)(?:作者|$)'
    print "来源:", re.search(regx, value).group(1).strip()
Example #7
def parse_interest_of_href_list(href_list, node):
    print "parse_interest_of_href_list ==> "

    if node is None:
        print "page tag of config.xml error"
        return None

    # Build the map of interesting features from the config node:
    # {feature tag: {child tag: child text}}.
    feature_map = {}
    for feature in node.iterchildren():
        tmap = {}
        for child in feature.iterchildren():
            tmap[child.tag] = child.text
        feature_map[feature.tag] = tmap

    # Fetch each page and extract the configured feature values.
    for href in href_list:
        html = get_html_from_url(href)
        if html == "":
            continue
        parse_interest_of_html(html, feature_map)
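
The nested loops flatten the config's page node into a two-level dict keyed by feature tag. With a hypothetical node it comes out like this:

# Hypothetical <page> node and the feature_map built from it.
from lxml import etree

page_node = etree.XML(
    "<page>"
    "<title><xpath>//h1</xpath></title>"
    "<pub_time><xpath>//span[@class='time']</xpath>"
    "<regx>\\d{4}-\\d{2}-\\d{2}</regx></pub_time>"
    "</page>")

feature_map = {}
for feature in page_node.iterchildren():
    tmap = {}
    for child in feature.iterchildren():
        tmap[child.tag] = child.text
    feature_map[feature.tag] = tmap

print feature_map
# -> {'title': {'xpath': '//h1'},
#     'pub_time': {'xpath': "//span[@class='time']", 'regx': '\\d{4}-\\d{2}-\\d{2}'}}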
Example #8
def parse_href_list_of_url(url, node):
    print "parse_herf_list_of_url ==> "

    # List of news links parsed out of the index page.
    href_list = []
    string = StringIO.StringIO('')

    if node is None:
        print "[Error]: index tag of config.xml error"
        return href_list

    # Fetch the page content.
    html = get_html_from_url(url)
    try:
        xpath = node.find('xpath').text
        regx_page = node.find('regx_page').text
    except Exception:
        print 'xpath regx_page config of ', url, ' error'
        return href_list

    print "xpath", xpath
    print "regx_page", regx_page

    # Parse the top-news links and titles with the configured xpath.
    doc = HTML.fromstring(html)
    news_items = doc.xpath(xpath)
    print "len: ", len(news_items)

    # Keep only anchors whose href matches the article-page URL pattern.
    for news_item in news_items:
        title = news_item.text
        href = news_item.get('href')
        if title is not None and href is not None \
                and re.match(regx_page, href):
            print title, href
            string.write(title + "\t" + href + "\n")
            href_list.append(href)

    # The buffered "title\thref" lines would be written to a file here;
    # this variant just discards the buffer.
    string.close()
    return href_list