Esempio n. 1
0
def get_article(url):
    """Fetch a Daum blog article that is nested inside two levels of frames.

    The function follows three hops: the page at ``url``, the frame document
    it points to, and finally the ``iframe`` whose ``name`` contains
    ``"if_b"``, which holds the article body in ``div#contentDiv``.

    Args:
        url: Address of the outer blog page; handed to ``html.parse``.

    Returns:
        A ``(text, img_links)`` tuple. ``text`` is the result of
        ``st.strip_html`` applied to the UTF-8 serialisation of the
        ``div#contentDiv`` element; ``img_links`` is whatever
        ``get_images`` returns for that same element.

    Raises:
        KeyError: the element expected to carry the first frame has no
            ``src`` attribute.
        IndexError: the outer page does not have the expected element
            layout, no usable ``iframe`` named ``*if_b*`` exists, or the
            final document has no ``div#contentDiv``.
    """
    base = "http://blog.daum.net"

    ## Section 1 - Get frame src.
    outer = html.parse(url).getroot()
    # Positional lookup: first child of the root's second child.
    # NOTE(review): presumably <html><head/><frameset><frame src=...> -- this
    # relies on a fixed page layout; confirm against a live page.
    frame_url = base + outer[1][0].attrib["src"]

    ## Section 2 - Get frame src(2).
    wrapper = html.parse(frame_url).getroot()
    article_url = None
    for frame in wrapper.cssselect("iframe"):
        name = frame.get("name")
        src = frame.get("src")
        # Iframes without a name (or without a src) cannot be the article
        # frame; testing `"if_b" in None` would raise TypeError, so skip them.
        if name and src and "if_b" in name:
            article_url = base + src
            break  # only the first match was ever used
    if article_url is None:
        # Same exception type the missing list entry used to produce.
        raise IndexError("no iframe with 'if_b' in its name at %s" % frame_url)

    ## Section 3 - Get contents of article.
    inner = html.parse(article_url).getroot()
    article = inner.cssselect("div#contentDiv")[0]

    img_links = get_images(article)

    ## Section 4 - Return data.
    return st.strip_html(html.tostring(article, encoding="utf-8", method="html")), img_links
Esempio n. 2
0
def get_article(url):
    """Download ``url`` and extract its first ``div.article`` element.

    Args:
        url: Address of the page to fetch with ``requests.get``.

    Returns:
        A ``(text, img_list)`` tuple. ``text`` is the result of
        ``st.strip_html`` applied to the element serialised as HTML in the
        encoding reported by the response; ``img_list`` is whatever
        ``get_images`` returns for the element.

    Raises:
        IndexError: the page contains no ``div.article`` element.
    """
    response = requests.get(url)
    # Serialise the article in the encoding reported by the response.
    page_encoding = response.encoding

    # Parse the raw bytes and keep only the first matching element.
    root = html.fromstring(response.content)
    matches = root.cssselect("div.article")
    node = matches[0]

    # Serialise before handing the node to get_images, as that helper is
    # opaque from here and may touch the element.
    markup = html.tostring(node, encoding=page_encoding, method="html")
    images = get_images(node)

    return st.strip_html(markup), images
Esempio n. 3
0
def get_article(url):
    """Fetch a Cyworld blog article by walking its chain of nested frames.

    The entry page is requested without extra headers; every later request
    sends a browser ``User-Agent``. URLs that do not contain
    ``blog.cyworld.com`` go through one extra frame hop. The last document
    is searched for ``div#myhompy_board_retrieveBoard_contents``.

    Args:
        url: Address of the entry page.

    Returns:
        A ``(text, img_list)`` tuple. ``text`` is the result of
        ``st.strip_html`` applied to the article element serialised as HTML
        in the encoding reported by the *first* response; ``img_list`` is
        whatever ``get_images`` returns for that element.

    Raises:
        IndexError: a page does not have the expected element layout or the
            article container is missing.
    """
    agent = "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13"

    def load_tree(address):
        # Follow-up requests identify themselves with the browser agent.
        reply = requests.get(address, headers={"User-Agent": agent})
        return html.fromstring(reply.text)

    ## Hop 1 - entry page (no custom header); remember its encoding.
    first = requests.get(url)
    charset = first.encoding
    page = html.fromstring(first.text)

    ## Hop 2 - locate the frame source by position in the tree.
    # NOTE(review): the [3][3] / [1][1] offsets assume a fixed page layout.
    if "blog.cyworld.com" in url:
        target = page[3][3].get("src")
    else:
        # Other URLs need one more hop before the [3][3] frame appears.
        page = load_tree(page[1][1].get("src"))
        target = page[3][3].get("src")

    ## Hop 3 - the frame document; look for the "myhompy" iframe.
    page = load_tree(target)
    for frame in page.cssselect("iframe"):
        src = frame.get("src")
        if src and "myhompy" in src:
            # The last matching iframe wins; with no match, target is
            # unchanged and the same address is requested again below.
            target = "http://web3.c2.cyworld.com" + src

    ## Hop 4 - the article document itself.
    page = load_tree(target)
    article = page.cssselect("div#myhompy_board_retrieveBoard_contents")[0]
    content = html.tostring(article, encoding=charset, method="html")
    img_list = get_images(article)

    return st.strip_html(content), img_list