def get_article(url):
    """Fetch a Daum blog post and return its stripped content and image links.

    Three pages are parsed in turn: the page at ``url``, the page referenced
    by the ``src`` of the element at position ``[1][0]`` under its root, and
    the page referenced by the first iframe whose ``name`` contains
    ``"if_b"``. The article is the first ``div#contentDiv`` of that last page.

    ``html`` is presumably ``lxml.html`` (imported elsewhere in the file).

    Args:
        url: Address of the blog post page.

    Returns:
        A ``(content, img_links)`` tuple: ``st.strip_html`` applied to the
        article element serialized as UTF-8 HTML, and the result of
        ``get_images`` for the same element.

    Raises:
        IndexError: If the outer page lacks the expected element position,
            no iframe name contains ``"if_b"``, or ``div#contentDiv`` is
            missing.
        KeyError: If the outer frame element has no ``src`` attribute.
    """
    base_url = "http://blog.daum.net"

    # Step 1 - the outer page only embeds a frame; follow its (relative) src.
    outer_root = html.parse(url).getroot()
    frame_url = base_url + outer_root[1][0].attrib["src"]

    # Step 2 - inside that frame, locate the iframe that carries the article.
    frame_root = html.parse(frame_url).getroot()
    article_url = None
    for frame in frame_root.cssselect("iframe"):
        # An iframe without a name attribute yields None from .get(); fall
        # back to "" so the membership test cannot raise TypeError.
        if "if_b" in (frame.get("name") or ""):
            article_url = base_url + frame.get("src")
            break  # only the first match was ever used
    if article_url is None:
        # Same exception type the old list lookup raised, but with a reason.
        raise IndexError("no iframe whose name contains 'if_b' in " + frame_url)

    # Step 3 - parse the article page and pull out the content container.
    article_root = html.parse(article_url).getroot()
    article = article_root.cssselect("div#contentDiv")[0]
    img_links = get_images(article)

    # Step 4 - serialize, strip the markup and return alongside the images.
    serialized = html.tostring(article, encoding="utf-8", method="html")
    return st.strip_html(serialized), img_links
def get_article(url):
    """Download the page at ``url`` and extract its first ``div.article``.

    Args:
        url: Address of the page to download.

    Returns:
        A two-item tuple: the value of ``st.strip_html`` for the article
        element serialized as HTML in the encoding reported by the response,
        and the value of ``get_images`` for the same element.

    Raises:
        IndexError: If the page contains no ``div.article`` element.
    """
    response = requests.get(url)
    document = html.fromstring(response.content)
    article_node = document.cssselect("div.article")[0]

    # Serialize first, then collect images, using the response's own charset.
    serialized = html.tostring(
        article_node, encoding=response.encoding, method="html"
    )
    images = get_images(article_node)
    return st.strip_html(serialized), images
def get_article(url):
    """Fetch a Cyworld blog post and return its stripped content and images.

    Follows a chain of frame ``src`` attributes, downloading one page per
    hop, until it reaches a page containing
    ``div#myhompy_board_retrieveBoard_contents``.

    Args:
        url: Address of the post. URLs containing ``"blog.cyworld.com"`` read
            the first frame link from a different position in the page than
            any other URL.

    Returns:
        A ``(content, img_list)`` tuple: ``st.strip_html`` applied to the
        serialized article element, and the result of ``get_images`` for it.

    Raises:
        IndexError: If a page lacks an expected element position or the
            article ``div`` is missing.
    """
    ## Section 0 - Initial set.
    dem = list()  # parsed page roots, in fetch order
    blog_url = list()  # frame URLs found so far; the last one is fetched next
    user_agent = "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13"
    ## Section 1 - Get the first frame src.
    # NOTE(review): this first request does not send the User-Agent header
    # that every later request uses - confirm that is intentional.
    contents = requests.get(url)
    # The encoding reported for this first response is reused below to
    # serialize the article taken from the last response.
    charset = contents.encoding
    dem.append(html.fromstring(contents.text))
    if "blog.cyworld.com" in url:
        blog_url.append(dem[0][3][3].get("src"))
    else:
        blog_url.append(dem[len(dem) - 1][1][1].get("src"))
    ## Section 2 - Get the second frame src.
    # NOTE(review): the original section title read "if not short url", but
    # as laid out here this step runs for every URL - confirm it was not
    # meant to be nested under the else branch above.
    contents = requests.get(blog_url[len(blog_url) - 1], headers={"User-Agent": user_agent})
    dem.append(html.fromstring(contents.text))
    blog_url.append(dem[len(dem) - 1][3][3].get("src"))
    ## Section 3 - Get the third frame src.
    contents = requests.get(blog_url[len(blog_url) - 1], headers={"User-Agent": user_agent})
    dem.append(html.fromstring(contents.text))
    # Every iframe whose src contains "myhompy" is appended, so the last match
    # is the one fetched next; with no match the previous URL is fetched again.
    for frame in dem[len(dem) - 1].cssselect("iframe"):
        if frame.get("src") and "myhompy" in frame.get("src"):
            blog_url.append("http://web3.c2.cyworld.com" + frame.get("src"))
    ## Section 4 - Get the content of the article.
    contents = requests.get(blog_url[len(blog_url) - 1], headers={"User-Agent": user_agent})
    dem.append(html.fromstring(contents.text))
    article = dem[len(dem) - 1].cssselect("div#myhompy_board_retrieveBoard_contents")[0]
    content = html.tostring(article, encoding=charset, method="html")
    img_list = get_images(article)
    return st.strip_html(content), img_list