def net_test(config_path='config.xml'): print "net_test ==> " html = get_html_from_url( "http://news.163.com/13/0727/16/94Q724990001124J.html") charset = get_charset_of_html(html) print "ifeng charset: ", charset doc = HTML.fromstring(html.decode(charset, 'ignore')) ss = StringIO('') #doc = HTML.fromstring(html.decode('utf8', 'ignore')) #doc = HTML.fromstring(html) result = doc.xpath("//div[@class='left']") print "len ", len(result) for ret in result: value = ret.text_content().strip() value = re.sub('[\r\n]', '', value) print value regx = '(\d{4}.\d{1,2}.\d{1,2}.*\d{1,2}:\d{1,2})' print "时间:", re.search(regx, value).groups(0)[0] #regx = u'来源[::\s]*([^\s]*)[有]*' #print "来源:", re.search(regx, value).groups(0)[0].strip() #regx = u'\d|-|:|(有.*)' regx = u'来源[::\s]*([^\s]+)有.*' print "来源:", re.search(regx, value).groups(0)[0].strip()
def ifeng_test(config_path='config.xml'): print "sohu_test ==> " html = get_html_from_url( "http://news.ifeng.com/society/1/detail_2013_07/27/27973995_0.shtml") print html[:800] charset = get_charset_of_html(html) print "ifeng charset: ", charset doc = HTML.fromstring(html.decode(charset, 'ignore')) ss = StringIO('') #doc = HTML.fromstring(html.decode('utf8', 'ignore')) #doc = HTML.fromstring(html) result = doc.xpath("//div[@id='artical_sth']/p") print "len ", len(result) for ret in result: value = ret.text_content().strip() value = re.sub('[\r\n]', '', value) print value regx = '(\d{4}.\d{1,2}.\d{1,2}.*\d{1,2}:\d{1,2})' print "时间:", re.search(regx, value).groups(0)[0] regx = u'来源[::]*([^\s]*)[\s]*' print "来源:", re.search(regx, value).groups(0)[0].strip() xpath_cont = "//div[@id='main_content']/p" conts = doc.xpath(xpath_cont) if conts is not None: for cont in conts: value = cont.text_content().strip() print value
def parse_config(config_path="config.xml"): print "parse_xml ==> " html = get_html_from_url( "http://finance.sina.com.cn/china/20130727/015816259014.shtml") #print html[:600] ss = StringIO('') charset = get_charset_of_html(html) doc = HTML.fromstring(html.decode(charset, 'ignore')) #doc = HTML.fromstring(html) result = doc.xpath('//div[@class="artInfo"]') #result = doc.xpath("//div[@id='artibody']/p") print "len ", len(result) for ret in result: value = ret.text_content().strip() regx = '(\d{4}.\d{1,2}.\d{1,2}.*\d{1,2}:\d{1,2})' print "时间:", re.search(regx, value).groups(0)[0] print value regx = u':\d{2}([^\s]*)[\s]*' print "来源:", re.search(regx, value).groups(0)[0].strip()
def get_and_parse_herf_list(herf_list): print "get_and_parse_herf_list->" for herf in herf_list: html = get_html_from_url(herf) tree = etree.HTML(html.decode('gbk', 'ignore')) try: # 获取主题 title = tree.xpath(xpath_title)[0].text # 获取时间 pub_time_raw = tree.xpath(xpath_pub_time)[0].text for match in re.finditer(regx_pub_time, pub_time_raw): pub_time = match.group() # 获取来源 source = tree.xpath(xpath_source)[0].text # 主体内容 content = StringIO.StringIO("") for section in tree.xpath(xpath_content): content.write(section.text) except: pass print "herf: ", herf print "title: ", title print "pub_time: ", pub_time print "source: ", source print "content: ", content.getvalue()
def get_and_parse_herf_list(herf_list): print "get_and_parse_herf_list->" for herf in herf_list: html = get_html_from_url(herf) tree = etree.HTML(html.decode("gbk", "ignore")) try: # 获取主题 title = tree.xpath(xpath_title)[0].text # 获取时间 pub_time_raw = tree.xpath(xpath_pub_time)[0].text for match in re.finditer(regx_pub_time, pub_time_raw): pub_time = match.group() # 获取来源 source = tree.xpath(xpath_source)[0].text # 主体内容 content = StringIO.StringIO("") for section in tree.xpath(xpath_content): content.write(section.text) except: pass print "herf: ", herf print "title: ", title print "pub_time: ", pub_time print "source: ", source print "content: ", content.getvalue()
def net_test(config_path = 'config.xml'): print "net_test ==> " html = get_html_from_url("http://news.163.com/13/0727/16/94Q724990001124J.html") charset = get_charset_of_html(html) print "ifeng charset: ", charset doc = HTML.fromstring(html.decode(charset, 'ignore')) ss = StringIO('') #doc = HTML.fromstring(html.decode('utf8', 'ignore')) #doc = HTML.fromstring(html) result = doc.xpath("//div[@class='left']") print "len ", len(result) for ret in result: value = ret.text_content().strip() value = re.sub('[\r\n]','',value) print value regx = '(\d{4}.\d{1,2}.\d{1,2}.*\d{1,2}:\d{1,2})' print "时间:", re.search(regx, value).groups(0)[0] #regx = u'来源[::\s]*([^\s]*)[有]*' #print "来源:", re.search(regx, value).groups(0)[0].strip() #regx = u'\d|-|:|(有.*)' regx = u'来源[::\s]*([^\s]+)有.*' print "来源:", re.search(regx, value).groups(0)[0].strip()
def parse_config(config_path = "config.xml"): print "parse_xml ==> " html = get_html_from_url("http://finance.sina.com.cn/china/20130727/015816259014.shtml") #print html[:600] ss = StringIO('') charset = get_charset_of_html(html) doc = HTML.fromstring(html.decode(charset, 'ignore')) #doc = HTML.fromstring(html) result = doc.xpath('//div[@class="artInfo"]') #result = doc.xpath("//div[@id='artibody']/p") print "len ", len(result) for ret in result: value = ret.text_content().strip() regx = '(\d{4}.\d{1,2}.\d{1,2}.*\d{1,2}:\d{1,2})' print "时间:", re.search(regx, value).groups(0)[0] print value regx = u':\d{2}([^\s]*)[\s]*' print "来源:", re.search(regx, value).groups(0)[0].strip()
def ifeng_test(config_path = 'config.xml'): print "sohu_test ==> " html = get_html_from_url("http://news.ifeng.com/society/1/detail_2013_07/27/27973995_0.shtml") print html[:800] charset = get_charset_of_html(html) print "ifeng charset: ", charset doc = HTML.fromstring(html.decode(charset, 'ignore')) ss = StringIO('') #doc = HTML.fromstring(html.decode('utf8', 'ignore')) #doc = HTML.fromstring(html) result = doc.xpath("//div[@id='artical_sth']/p") print "len ", len(result) for ret in result: value = ret.text_content().strip() value = re.sub('[\r\n]','',value) print value regx = '(\d{4}.\d{1,2}.\d{1,2}.*\d{1,2}:\d{1,2})' print "时间:", re.search(regx, value).groups(0)[0] regx = u'来源[::]*([^\s]*)[\s]*' print "来源:", re.search(regx, value).groups(0)[0].strip() xpath_cont = "//div[@id='main_content']/p" conts = doc.xpath(xpath_cont) if conts is not None: for cont in conts: value = cont.text_content().strip() print value
def test_parse_href_list_of_url(url, node): print "parse_herf_list_of_url ==> " # 记录解析出来的新闻链接列表 href_list = [] string = StringIO.StringIO('') # 判断配置文件中index相关标签节点是否存在 if node == None: print "[Error]: index tag of config.xml error" return href_list # 获取网页内容 html = get_html_from_url(url) try: xpath_list = node.findall('xpath') regx_page = node.find('regx_page').text except: print 'xpath regx_page config of ', url, ' error' return href_list # 获取网页编码格式,并进行解码 charset = get_charset_of_html(html) print "charset: ", charset html = html.decode(charset, 'ignore') print "regx_page", regx_page # 根据xpath解析top news的链接和标题 doc = HTML.fromstring(html) for xpath_item in xpath_list: xpath = xpath_item.text print "xpath: ", xpath news_items = doc.xpath(xpath) print "len: ", len(news_items) # 逐个解析 这个需要两个list,一个记录href 一个记录满足page url格式的 for news_item in news_items: title = news_item.text href = news_item.get('href') # 判断是否是加粗字体 print title, href if title is not None and re.match(regx_page, href): string.write(title + "\t" + href + "\n") href_list.append(href) # 将结果记录到文件中 string.close() return href_list
def test_parse_href_list_of_url(url, node): print "parse_herf_list_of_url ==> " # 记录解析出来的新闻链接列表 href_list = [] string = StringIO.StringIO("") # 判断配置文件中index相关标签节点是否存在 if node == None: print "[Error]: index tag of config.xml error" return href_list # 获取网页内容 html = get_html_from_url(url) try: xpath_list = node.findall("xpath") regx_page = node.find("regx_page").text except: print "xpath regx_page config of ", url, " error" return href_list # 获取网页编码格式,并进行解码 charset = get_charset_of_html(html) print "charset: ", charset html = html.decode(charset, "ignore") print "regx_page", regx_page # 根据xpath解析top news的链接和标题 doc = HTML.fromstring(html) for xpath_item in xpath_list: xpath = xpath_item.text print "xpath: ", xpath news_items = doc.xpath(xpath) print "len: ", len(news_items) # 逐个解析 这个需要两个list,一个记录href 一个记录满足page url格式的 for news_item in news_items: title = news_item.text href = news_item.get("href") # 判断是否是加粗字体 print title, href if title is not None and re.match(regx_page, href): string.write(title + "\t" + href + "\n") href_list.append(href) # 将结果记录到文件中 string.close() return href_list
def sohu_test(config_path = 'config.xml'): print "sohu_test ==> " html = get_html_from_url("http://business.sohu.com/20130727/n382723680.shtml") ss = StringIO('') doc = HTML.fromstring(html) result = doc.xpath("//div[@class='time-source']") print "len ", len(result) for ret in result: value = ret.text_content().strip() #value = re.sub('[\r\n]','',value) print value regx = '(\d{4}.\d{1,2}.\d{1,2}.*\d{1,2}:\d{1,2})' print "时间:", re.search(regx, value).groups(0)[0] regx = u'来源[::](.*)(作者)*' print "时间:", re.search(regx, value).groups(0)[0].strip()
def sohu_test(config_path='config.xml'): print "sohu_test ==> " html = get_html_from_url( "http://business.sohu.com/20130727/n382723680.shtml") ss = StringIO('') doc = HTML.fromstring(html) result = doc.xpath("//div[@class='time-source']") print "len ", len(result) for ret in result: value = ret.text_content().strip() #value = re.sub('[\r\n]','',value) print value regx = '(\d{4}.\d{1,2}.\d{1,2}.*\d{1,2}:\d{1,2})' print "时间:", re.search(regx, value).groups(0)[0] regx = u'来源[::](.*)(作者)*' print "时间:", re.search(regx, value).groups(0)[0].strip()
def parse_interest_of_href_list(href_list, node): print "parse_interest_of_href_list ==> " if node == None: print "page tag of config.xml error" return None # 获取感兴趣的属性字典 feature_map = {} for feature in node.iterchildren(): tmap = {} for child in feature.iterchildren(): tmap[child.tag] = child.text feature_map[feature.tag] = tmap # 获取待抓取的属性值 for href in href_list: html = get_html_from_url(href) if html == "": continue parse_interest_of_html(html, feature_map)
def parse_href_list_of_url(url, node): print "parse_herf_list_of_url ==> " # 记录解析出来的新闻链接列表 href_list = [] string = StringIO.StringIO('') if node == None: print "[Error]: index tag of config.xml error" return href_list # 获取网页内容 html = get_html_from_url(url) try: xpath = node.find('xpath').text regx_page = node.find('regx_page').text except: print 'xpath regx_page config of ', url, ' error' return href_list print "xpath", xpath print "regx_page", regx_page # 根据xpath解析top news的链接和标题 doc = HTML.fromstring(html) news_items = doc.xpath(xpath) print "len: ", len(news_items) # 逐个解析 for news_item in news_items: title = news_item.text href = news_item.get('href') if title != None and re.match(regx_page, href): print title, href string.write(title + "\t" + href + "\n") href_list.append(href) # 将结果记录到文件中 string.close() return href_list
def parse_href_list_of_url(url, node): print "parse_herf_list_of_url ==> " # 记录解析出来的新闻链接列表 href_list = [] string = StringIO.StringIO("") if node == None: print "[Error]: index tag of config.xml error" return href_list # 获取网页内容 html = get_html_from_url(url) try: xpath = node.find("xpath").text regx_page = node.find("regx_page").text except: print "xpath regx_page config of ", url, " error" return href_list print "xpath", xpath print "regx_page", regx_page # 根据xpath解析top news的链接和标题 doc = HTML.fromstring(html) news_items = doc.xpath(xpath) print "len: ", len(news_items) # 逐个解析 for news_item in news_items: title = news_item.text href = news_item.get("href") if title != None and re.match(regx_page, href): print title, href string.write(title + "\t" + href + "\n") href_list.append(href) # 将结果记录到文件中 string.close() return href_list
def parse_interest_of_href_list(href_list, node): print "parse_interest_of_href_list ==> " if node == None: print "page tag of config.xml error" return None # 获取感兴趣的属性字典 feature_map = {} for feature in node.iterchildren(): tmap = {} for child in feature.iterchildren(): tmap[child.tag] = child.text feature_map[feature.tag] = tmap # 获取待抓取的属性值 for href in href_list: html = get_html_from_url(href) if html == '': continue parse_interest_of_html(html, feature_map)