def main():
    """Fetch Naver Post clips for a volume/member and print their contents.

    sys.argv[1] is the post URL carrying volumeNo/memberNo query params.
    Echoes stdin through unchanged, then probes clipNo 0..29, giving up
    after more than two consecutive-ish misses.
    """
    volumeNo = None
    memberNo = None

    # Pull volumeNo/memberNo out of the URL given on the command line.
    m = re.search(r'volumeNo=(?P<volumeNo>\d+)&memberNo=(?P<memberNo>\d+)', sys.argv[1])
    if m:
        volumeNo = int(m.group("volumeNo"))
        memberNo = int(m.group("memberNo"))

    # Echo the original page content unchanged.
    for line in feedmakerutil.readStdinAsLineList():
        print(line.rstrip())

    failureCount = 0
    link_prefix = "http://m.post.naver.com/viewer/clipContentJson.nhn?volumeNo=%d&memberNo=%d&clipNo=" % (volumeNo, memberNo)
    for clipNo in range(30):
        link = link_prefix + str(clipNo)
        cmd = 'wget.sh "%s" utf8' % (link)
        result = feedmakerutil.execCmd(cmd)
        if not result or re.search(r'"notExistClip"', result):
            # Tolerate a couple of gaps in clip numbering, then stop probing.
            failureCount += 1
            if failureCount > 2:
                break
        else:
            clipContent = json.loads(result)["clipContent"]
            # BUG FIX: the replacement must be a raw string; in the original
            # '\1' was the control character chr(1), not a backreference, so
            # the rewritten image URLs were silently corrupted.
            clipContent = re.sub(r'src="{{{[^\|]*\|/([^\|]+)\|\d+|\d+}}}"', r'src="http://post.phinf.naver.net/\1"', clipContent)
            # NOTE(review): the alternation `\|\d+|\d+}}}"` in the pattern above
            # looks suspicious (likely meant `\|\d+\|\d+}}}"`) — confirm against
            # real clipContent markup before tightening it.
            clipContent = re.sub(r'<img src=\'http://static.post.naver.net/image/im/end/toast_flick.png\'/>', '', clipContent)
            print(clipContent)
def main():
    """Print cwcontent comic images from stdin and from an optional page 2.

    Scans the HTML on stdin for a second-page link (two known anchor
    shapes) and for image tags; if a second page was found, fetches it and
    prints its images as well.
    """
    second_page = ""

    for raw in feedmakerutil.readStdinAsLineList():
        text = raw.rstrip()
        match = re.search(
            r"<a href='(?P<secondPageUrl>http://[^']+)'[^>]*><img src='http://cwstatic\.asiae\.co\.kr/images/cartoon/btn_s\.gif'/>",
            text,
        )
        if match:
            second_page = match.group("secondPageUrl")
            continue
        match = re.search(r"<a href='(?P<secondPageUrl>http://stoo.asiae.co.kr/cartoon/view.htm[^']*)'>2페이지</a>", text)
        if match:
            second_page = match.group("secondPageUrl")
            continue
        match = re.search(r"<img src='(?P<imgUrl>http://cwcontent[^']+)'.*/>", text)
        if match:
            print("<img src='%s' width='100%%'/>" % (match.group("imgUrl")))

    if second_page != "":
        command = "wget.sh '%s' | extract_element.py extraction" % (second_page)
        output = feedmakerutil.execCmd(command)
        if output:
            for text in output.split("\n"):
                match = re.search(r"<img\s*[^>]*src=(?:\'|\")(?P<imgUrl>http://cwcontent[^\'\"]+)(?:\'|\").*/>", text)
                if match:
                    print("<img src='%s' width='100%%'/>" % (match.group("imgUrl")))
def main():
    """Rewrite Naver comic image tags and probe extra lazily-loaded frames.

    Passes <meta>/<style> lines through, normalizes imgcomic.naver image
    tags, then (when an indexed image was seen) spider-checks indices 0..59
    on the same path and prints those that exist.
    """
    imgHost = ""
    imgPath = ""
    imgIndex = -1
    imgExt = "jpg"
    pageUrl = sys.argv[1]

    # Compiled once: indexed (..._NN.ext) and plain image tag patterns.
    indexed_img = re.compile(r"<img src='http://(?P<imgHost>imgcomic.naver.(?:com|net))/(?P<imgPath>[^']+_)(?P<imgIndex>\d+)\.(?P<imgExt>jpg|gif)", re.IGNORECASE)
    plain_img = re.compile(r"<img src='http://(?P<imgHost>imgcomic.naver.(?:com|net))/(?P<imgPath>[^']+)\.(?P<imgExt>jpg|gif)", re.IGNORECASE)

    for raw in feedmakerutil.readStdinAsLineList():
        text = raw.rstrip()
        if re.search(r"<(meta|style)", text):
            print(text)
            continue
        match = indexed_img.search(text)
        if match:
            imgHost = match.group("imgHost")
            imgPath = match.group("imgPath")
            imgIndex = int(match.group("imgIndex"))
            imgExt = match.group("imgExt")
            print("<img src='http://%s/%s%d.%s' width='100%%'/>" % (imgHost, imgPath, imgIndex, imgExt))
            continue
        match = plain_img.search(text)
        if match:
            imgHost = match.group("imgHost")
            imgPath = match.group("imgPath")
            imgExt = match.group("imgExt")
            print("<img src='http://%s/%s.%s' width='100%%'/>" % (imgHost, imgPath, imgExt))

    if imgPath != "" and imgIndex >= 0:
        # Probe for additional images that the page loads dynamically.
        for idx in range(60):
            candidate = "http://%s/%s%d.%s" % (imgHost, imgPath, idx, imgExt)
            probe = 'wget.sh --spider --referer "%s" "%s"' % (pageUrl, candidate)
            if feedmakerutil.execCmd(probe):
                print("<img src='http://%s/%s%d.%s' width='100%%'/>" % (imgHost, imgPath, idx, imgExt))
def main():
    """Print article images from stdin and from numbered pagination links.

    Collects numbered-page anchor URLs from stdin, prints inline images,
    then fetches each collected page (cp949-encoded) and prints its
    sportskhan article images.
    """
    encoding = "cp949"
    page_urls = []

    for raw in feedmakerutil.readStdinAsLineList():
        text = raw.rstrip()
        anchor = re.search(r"<a href='(?P<url>http://[^']+)'[^>]*>\d+</a>", text)
        if anchor:
            page_urls.append(anchor.group("url"))
            continue
        image = re.search(r"<img src='(?P<url>[^']+)'[^>]*>", text)
        if image:
            print("<img src='%s' width='100%%'/>" % (image.group("url")))

    for page_url in page_urls:
        output = feedmakerutil.execCmd("wget.sh '%s' %s" % (page_url, encoding))
        if not output:
            continue
        for text in output.split("\n"):
            image = re.search(r"<img\s*[^>]*src=(?:'|\")?(?P<url>http://images.sportskhan.net/article/[^'\"\s]+)(?:'|\")?[^>]*>", text)
            if image:
                print("<img src='%s' width='100%%'/>" % (image.group("url")))
def main():
    """Collect mt.co.kr comic images (skipping ad banners) across two pages.

    Reads page HTML from stdin, gathers image URLs, remembers a
    second-page link, drops the trailing ad image, optionally fetches the
    second page, then prints every collected image.
    """
    secondPageUrl = ""
    imgUrlList = []

    for line in feedmakerutil.readStdinAsLineList():
        line = line.rstrip()
        m = re.search(r"<img src='(?P<imgUrl>http://comicmenu.mt.co.kr/[^']+.jpg)'(?: width='\d+%')?/>", line)
        if m:
            imgUrl = m.group("imgUrl")
            # Skip the known advertisement image.
            if re.search(r"http://comicmenu.mt.co.kr/banner/comic_\d+_100811.jpg", imgUrl):
                continue
            imgUrlList.append(imgUrl)

        m = re.search(r"<a href='(?P<secondPageUrl>http://[^']+)'[^>]*>", line)
        if m:
            secondPageUrl = m.group("secondPageUrl")
        else:
            m = re.search(r"<img src='http://comicmenu\.mt\.co\.kr/images/btn_cartoon_2p\.gif'/>", line)
            if m:
                break

    # Drop the trailing advertisement image.
    # BUG FIX: guard against an empty list — the unconditional pop() raised
    # IndexError whenever no image was collected.
    if imgUrlList:
        imgUrlList.pop()

    if secondPageUrl != "":
        cmd = "wget.sh '%s' | extract.py '%s'" % (secondPageUrl, sys.argv[1])
        result = feedmakerutil.execCmd(cmd)
        # BUG FIX: execCmd() returns a falsy value on failure; calling
        # split() on it crashed. Only parse when output is present (this
        # matches the guard used by the other scrapers in this file).
        if result:
            for line in result.split("\n"):
                m = re.search(r"<img src='(?P<imgUrl>http://comicmenu.mt.co.kr/[^']+.jpg)'(?: width='\d+%')?/>", line)
                if m:
                    imgUrl = m.group("imgUrl")
                    if re.search(r"http://comicmenu.mt.co.kr/banner/comic_\d+_100811.jpg", imgUrl):
                        continue
                    imgUrlList.append(imgUrl)

    for imgUrl in imgUrlList:
        print("<img src='%s' width='100%%'/>" % (imgUrl))
def main():
    """Echo stdin and append Daum webtoon viewer images.

    sys.argv[1] is the episode URL; the mobile and desktop viewers expose
    different JSON endpoints, chosen by the optional "m/" path segment.
    """
    # Echo the original page content unchanged.
    for line in feedmakerutil.readStdinAsLineList():
        print(line.rstrip())

    postLink = sys.argv[1]
    m = re.search(r"http://cartoon\.media\.daum\.net/(?P<mobile>m/)?webtoon/viewer/(?P<episodeId>\d+)$", postLink)
    if m:
        mobile = m.group("mobile")
        episodeId = m.group("episodeId")
        if mobile and mobile == "m/":
            url = "http://cartoon.media.daum.net/data/mobile/webtoon/viewer?id=" + episodeId
        else:
            url = "http://cartoon.media.daum.net/webtoon/viewer_images.js?webtoon_episode_id=" + episodeId
        cmd = "wget.sh '%s'" % (url)
        result = feedmakerutil.execCmd(cmd)
        if not result:
            die("can't download the page html from '%s'" % (url))
        # CLEANUP: removed dead locals (imgPrefix/imgIndex/imgExt/numUnits,
        # img_file_arr/img_url_arr/img_size_arr) that were never read.
        # The JSON may be jammed onto one line, so split on '}' as well.
        for line in re.split(r"}|\n", result):
            m = re.search(r"\"url\":\"(?P<imgUrl>http://[^\"]+)\",(?:(?:.*imageOrder)|(?:\s*$))", line)
            if m:
                imgUrl = m.group("imgUrl")
                # Skip the embedded video player object.
                if re.search(r"VodPlayer\.swf", imgUrl):
                    continue
                print("<img src='%s' width='100%%'/>" % (imgUrl))
# Esempio n. 7 (Italian: "Example no. 7" — listing-site marker left over
# from extraction; kept as a comment so the file remains parseable)
# 0
def traverseElement(element, url, encoding):
    """Recursively walk a BeautifulSoup node and write sanitized HTML to stdout.

    element:  a BeautifulSoup Tag, NavigableString or Comment node
    url:      base URL used to absolutize relative src/href values
    encoding: unused here; kept for interface compatibility with callers

    Returns 1 when output was produced for this subtree, -1 otherwise.
    Reads/writes the module-global footnoteNum to pair
    openFootnoteLayer('<num>') anchors with their footnote-layer <div>s.
    """
    global footnoteNum
    ret = -1

    if isinstance(element, Comment):
        # skip comments and everything inside them
        return ret
    elif not hasattr(element, 'name') or element.name is None:
        # text node or self-close element (<br/>)
        p = re.compile(r"^\s*$")
        if not p.match(str(element)):
            sys.stdout.write("%s" % html.escape(str(element)))
        ret = 1
        return ret
    else:
        # Principle:
        # every element contains other elements or text, so in general we
        # emit the open tag, recurse into the children via the generic
        # traversal at the bottom, then emit the close tag.
        #
        # Exceptions:
        # - images must have their src attribute emitted (plus a <br/>)
        # - naver.net image paths ending with /17.jpg must be removed
        # - table-related tags are ignored
        # - javascript and flash need special handling

        openCloseTag = False
        if element.name == "p":
            print("<p>")
            for e in element.contents:
                ret = traverseElement(e, url, encoding)
            # Return here: falling through to the generic child traversal
            # below would emit the text wrapped by this <p> a second time.
            print("</p>")
            ret = 1
            return ret
        elif element.name == "img":
            src = ""
            # Lazy-loading attributes take priority over the plain src.
            if element.has_attr("data-lazy-src"):
                dataLazySrc = element["data-lazy-src"]
                if not re.search(r'(https?:)?//', dataLazySrc):
                    dataLazySrc = feedmakerutil.concatenateUrl(
                        url, dataLazySrc)
                src = dataLazySrc
            elif element.has_attr("lazysrc"):
                lazySrc = element["lazysrc"]
                if not re.search(r'(https?:)?//', lazySrc):
                    lazySrc = feedmakerutil.concatenateUrl(url, lazySrc)
                src = lazySrc
            elif element.has_attr("data-src"):
                dataSrc = element["data-src"]
                if not re.search(r'(https?:)?//', dataSrc):
                    dataSrc = feedmakerutil.concatenateUrl(url, dataSrc)
                src = dataSrc
            elif element.has_attr("data-original"):
                dataSrc = element["data-original"]
                if not re.search(r'(https?:)?//', dataSrc):
                    dataSrc = feedmakerutil.concatenateUrl(url, dataSrc)
                src = dataSrc
            elif element.has_attr("src"):
                src = element["src"]
                if not re.search(r'(https?:)?//', src):
                    src = feedmakerutil.concatenateUrl(url, src)
                if "ncc.phinf.naver.net" in src and ("/17.jpg" in src
                                                     or "/8_17px.jpg" in src
                                                     or "/7px.jpg" in src
                                                     or "/20px.jpg" in src):
                    # drop images that are not accessible from outside
                    return ret
            if src and src != "":
                if re.search(r'^//', src):
                    # protocol-relative URL: force http
                    src = re.sub(r'^//', 'http://', src)
                sys.stdout.write("<img src='%s'" % src)
            if element.has_attr("width"):
                sys.stdout.write(" width='%s'" % element["width"])
            sys.stdout.write("/>\n")
            ret = 1
        elif element.name == "input":
            # BUG FIX: was `element.name in ("input")` — a substring test
            # against the string "input" (missing tuple comma) that also
            # matched tag names like "i", "in" and "put".
            if checkElementClass(element, "input", "originSrc"):
                if element.has_attr("value"):
                    value = element["value"]
                    if not re.search(r'(https?:)?//', value):
                        value = feedmakerutil.concatenateUrl(url, value)
                    sys.stdout.write("<img src='%s'/>\n" % value)
                    ret = 1
        elif element.name == "canvas":
            # canvas used as a lazy image container
            src = ""
            if element.has_attr("data-original"):
                src = element["data-original"]
            elif element.has_attr("data-src"):
                src = element["data-src"]
            if src and src != "":
                sys.stdout.write("<img src='%s'" % src)
                if element.has_attr("width"):
                    sys.stdout.write(" width='%s'" % element["width"])
                sys.stdout.write("/>\n")
                ret = 1
        elif element.name == "a":
            if element.has_attr("onclick"):
                # footnote-layer toggle anchor: remember the footnote number
                # for the matching layer div and drop the anchor itself
                m = re.search(r"(open|close)FootnoteLayer\('(\d+)'",
                              element["onclick"])
                if m:
                    openOrClose = m.group(1)
                    if openOrClose == "open":
                        footnoteNum = m.group(2)
                    return ret
            if element.has_attr("href"):
                # complementing href value
                href = element["href"]
                if not re.search(r'(https?:)?//', href):
                    href = feedmakerutil.concatenateUrl(url, href)
                # an A tag must carry its href and target attributes
                sys.stdout.write("<a href='%s'" % href)
                if element.has_attr("target"):
                    sys.stdout.write(" target='%s'>\n" % element["target"])
                else:
                    sys.stdout.write(">")
                ret = 1
                openCloseTag = True
        elif element.name in ("iframe", "embed"):
            if element.has_attr("src"):
                src = element["src"]
                if "video_player.nhn" in src or ".swf" in src or "getCommonPlayer.nhn" in src:
                    # flash content is flagged with a [Flash Player] marker
                    print("[Flash Player]<br/>")
                    print("<%s src='%s'></%s><br/>" %
                          (element.name, src, element.name))
                    print("<a href='%s'>%s</a><br/>" % (src, src))
                else:
                    sys.stdout.write("%s\n" % str(element))
                ret = 1
        elif element.name in ("param", "object"):
            if element.has_attr(
                    "name") and element["name"] == "Src" and element.has_attr(
                        "value") and ".swf" in element["value"]:
                src = element["value"]
                print("[Flash Player]<br/>")
                print("<video src='%s'></video><br/>" % (src))
                print("<a href='%s'>%s</a><br/>" % (src, src))
            ret = 1
        elif element.name == "map":
            # image map: extract only link information from area elements
            for child in element.contents:
                if hasattr(child, "name") and child.name == "area":
                    linkHref = "#"
                    linkTitle = "empty link title"
                    if child.has_attr("href"):
                        linkHref = child["href"]
                    if child.has_attr("alt"):
                        linkTitle = child["alt"]
                    print(
                        "<br/><br/><strong><a href='%s'>%s</a></strong><br/><br/>"
                        % (linkHref, linkTitle))
                    ret = 1
        elif element.name in ("o:p", "st1:time"):
            # BUG FIX: this branch was mis-indented inside the "map" child
            # loop above (and tested the parent's name there), making it
            # unreachable; restored to the main chain per its intent:
            # skip unknown element
            return ret
        elif element.name == "script":
            # BUG FIX: was `element.name in ("script")` — a substring test
            # (missing tuple comma).
            # skip sub-element
            return ret
        elif element.name in ("v:shapetype", "qksdmssnfl", "qksdmssnfl<span"):
            # skip malformed element
            return ret
        elif element.name in ("style", "st1:personname", "script"):
            # skip sub-elements ("script" is already handled above; kept
            # here for parity with the original tuple)
            return ret
        elif element.name in ("xmp", "form"):
            ret = 1
        else:
            if checkElementClass(element, "div", "paginate_v1"):
                # <div class="paginate_v1">...
                # prefetch the pages that would otherwise be loaded via ajax
                matches = re.findall(
                    r"change_page\('[^']+/literature_module/(\d+)/literature_(\d+)_(\d+)\.html'",
                    str(element))
                for match in matches:
                    leafId = int(match[0])
                    articleNum = int(match[1])
                    pageNum = int(match[2])
                    url = "http://navercast.naver.com/ncc_request.nhn?url=http://data.navercast.naver.com/literature_module/%d/literature_%d_%d.html" % (
                        leafId, articleNum, pageNum)
                    cmd = "wget.sh '%s' | extract_literature.py" % (url)
                    result = feedmakerutil.execCmd(cmd)
                    if result:
                        print(result)
                    ret = 1
                return ret
            elif checkElementClass(element, "div", "view_option option_top"):
                # drop the font-size / bookmark toolbar of "Today's Literature"
                return ret
            elif checkElementClass(element, "span",
                                   "page_prev") or checkElementClass(
                                       element, "span", "page_next"):
                # <span class="page_prev">... or <span class="page_next">...
                # drop the previous/next page arrow link areas
                return ret
            elif checkElementClass(element, "dl", "designlist"):
                # <dl class="designlist">...
                # skip this element and sub-elements
                return ret
            elif checkElementClass(element, "div", "na_ly_cmt"):
                # compare against the number seen in openFootnoteLayer('<num>'...)
                # NOTE(review): hasattr(element, "id") looks like it was meant
                # to be element.has_attr("id") — left unchanged; confirm
                # against BeautifulSoup attribute-access semantics.
                if hasattr(element, "id"):
                    if element["id"] != "footnoteLayer" + str(footnoteNum):
                        return ret
            else:
                sys.stdout.write("<%s>\n" % element.name)
                openCloseTag = True
                ret = 1

        # Generic child traversal for any branch that did not return above.
        if hasattr(element, 'contents'):
            for e in element.contents:
                if e == "\n":
                    continue
                else:
                    ret = traverseElement(e, url, encoding)
        elif isinstance(element, Comment):
            # effectively unreachable: Comments are filtered at the top
            return ret
        else:
            sys.stdout.write(element)
            ret = 1
            return ret

        if openCloseTag:
            sys.stdout.write("</%s>\n" % element.name)
            ret = 1

    return ret