def main():
    """Echo stdin, then fetch and print up to 30 naver-post clip contents.

    sys.argv[1] is a URL carrying volumeNo/memberNo query parameters; each
    clipNo in 0..29 is fetched as JSON via wget.sh and its "clipContent"
    field is printed after image-path rewriting. Stops after 3 failures.
    """
    volumeNo = None
    memberNo = None
    m = re.search(r'volumeNo=(?P<volumeNo>\d+)&memberNo=(?P<memberNo>\d+)', sys.argv[1])
    if m:
        volumeNo = int(m.group("volumeNo"))
        memberNo = int(m.group("memberNo"))

    # pass the already-extracted page content through unchanged
    lineList = feedmakerutil.readStdinAsLineList()
    for line in lineList:
        line = line.rstrip()
        print(line)

    failureCount = 0
    link_prefix = "http://m.post.naver.com/viewer/clipContentJson.nhn?volumeNo=%d&memberNo=%d&clipNo=" % (volumeNo, memberNo)
    for clipNo in range(30):
        link = link_prefix + str(clipNo)
        cmd = 'wget.sh "%s" utf8' % (link)
        #print(cmd)
        result = feedmakerutil.execCmd(cmd)
        # `not result` also treats an empty download as a failure,
        # which previously fell through and crashed json.loads("")
        if not result or re.search(r'"notExistClip"', result):
            failureCount += 1
            # give up after a few consecutive missing clips
            if failureCount > 2:
                break
        else:
            clipContent = json.loads(result)["clipContent"]
            # BUG FIX: the replacement must be a raw string; in a plain
            # string '\1' is chr(0x01), not a group back-reference, so the
            # captured image path was never substituted.
            clipContent = re.sub(r'src="{{{[^\|]*\|/([^\|]+)\|\d+|\d+}}}"', r'src="http://post.phinf.naver.net/\1"', clipContent)
            clipContent = re.sub(r'<img src=\'http://static.post.naver.net/image/im/end/toast_flick.png\'/>', '', clipContent)
            print(clipContent)
def main():
    """Emit image tags found on stdin; if a second-page link is present,
    fetch that page too and emit its images."""
    second_page = ""
    for raw in feedmakerutil.readStdinAsLineList():
        text = raw.rstrip()
        # second-page link, button-image form
        match = re.search(
            r"<a href='(?P<secondPageUrl>http://[^']+)'[^>]*><img src='http://cwstatic\.asiae\.co\.kr/images/cartoon/btn_s\.gif'/>",
            text,
        )
        if match:
            second_page = match.group("secondPageUrl")
            continue
        # second-page link, text form
        match = re.search(r"<a href='(?P<secondPageUrl>http://stoo.asiae.co.kr/cartoon/view.htm[^']*)'>2페이지</a>", text)
        if match:
            second_page = match.group("secondPageUrl")
            continue
        # content image on the first page
        match = re.search(r"<img src='(?P<imgUrl>http://cwcontent[^']+)'.*/>", text)
        if match:
            print("<img src='%s' width='100%%'/>" % (match.group("imgUrl")))
    if second_page:
        cmd = "wget.sh '%s' | extract_element.py extraction" % (second_page)
        # print(cmd)
        page = feedmakerutil.execCmd(cmd)
        if page:
            for text in page.split("\n"):
                match = re.search(r"<img\s*[^>]*src=(?:\'|\")(?P<imgUrl>http://cwcontent[^\'\"]+)(?:\'|\").*/>", text)
                if match:
                    print("<img src='%s' width='100%%'/>" % (match.group("imgUrl")))
def main():
    """Echo meta/style lines, rewrite naver comic image tags found on stdin,
    then probe (via wget.sh --spider) for additional sequentially-numbered
    images that the page loads dynamically."""
    host = ""
    path = ""
    index = -1
    ext = "jpg"
    page_url = sys.argv[1]
    # hoisted patterns: indexed (..._N.jpg) and plain (....jpg) image forms
    indexed_re = re.compile(r"<img src='http://(?P<imgHost>imgcomic.naver.(?:com|net))/(?P<imgPath>[^']+_)(?P<imgIndex>\d+)\.(?P<imgExt>jpg|gif)", re.IGNORECASE)
    plain_re = re.compile(r"<img src='http://(?P<imgHost>imgcomic.naver.(?:com|net))/(?P<imgPath>[^']+)\.(?P<imgExt>jpg|gif)", re.IGNORECASE)
    for raw in feedmakerutil.readStdinAsLineList():
        text = raw.rstrip()
        if re.search(r"<(meta|style)", text):
            print(text)
            continue
        m = indexed_re.search(text)
        if m:
            host = m.group("imgHost")
            path = m.group("imgPath")
            index = int(m.group("imgIndex"))
            ext = m.group("imgExt")
            print("<img src='http://%s/%s%d.%s' width='100%%'/>" % (host, path, index, ext))
            continue
        m = plain_re.search(text)
        if m:
            host = m.group("imgHost")
            path = m.group("imgPath")
            ext = m.group("imgExt")
            print("<img src='http://%s/%s.%s' width='100%%'/>" % (host, path, ext))
    if path != "" and index >= 0:
        # add some additional images loaded dynamically
        for i in range(60):
            candidate = "http://%s/%s%d.%s" % (host, path, i, ext)
            cmd = 'wget.sh --spider --referer "%s" "%s"' % (page_url, candidate)
            if feedmakerutil.execCmd(cmd):
                print("<img src='http://%s/%s%d.%s' width='100%%'/>" % (host, path, i, ext))
def main():
    """Print image tags from stdin, collect numbered pagination links, then
    fetch each linked page and print its sportskhan article images."""
    page_urls = []
    encoding = "cp949"
    for raw in feedmakerutil.readStdinAsLineList():
        text = raw.rstrip()
        link = re.search(r"<a href='(?P<url>http://[^']+)'[^>]*>\d+</a>", text)
        if link:
            page_urls.append(link.group("url"))
            continue
        img = re.search(r"<img src='(?P<url>[^']+)'[^>]*>", text)
        if img:
            print("<img src='%s' width='100%%'/>" % (img.group("url")))
    for page_url in page_urls:
        cmd = "wget.sh '%s' %s" % (page_url, encoding)
        page = feedmakerutil.execCmd(cmd)
        if not page:
            continue
        for text in page.split("\n"):
            img = re.search(r"<img\s*[^>]*src=(?:'|\")?(?P<url>http://images.sportskhan.net/article/[^'\"\s]+)(?:'|\")?[^>]*>", text)
            if img:
                print("<img src='%s' width='100%%'/>" % (img.group("url")))
def main():
    """Collect comic image URLs from stdin (skipping a known banner ad),
    optionally fetch a second page via wget.sh | extract.py, then print all
    collected images. sys.argv[1] is passed through to extract.py."""
    secondPageUrl = ""
    imgUrlList = []
    # hoisted: content-image and banner-ad patterns are used in both passes
    imgRe = re.compile(r"<img src='(?P<imgUrl>http://comicmenu.mt.co.kr/[^']+.jpg)'(?: width='\d+%')?/>")
    adRe = re.compile(r"http://comicmenu.mt.co.kr/banner/comic_\d+_100811.jpg")
    for line in feedmakerutil.readStdinAsLineList():
        line = line.rstrip()
        m = imgRe.search(line)
        if m:
            imgUrl = m.group("imgUrl")
            # skip the ad image
            if adRe.search(imgUrl):
                continue
            imgUrlList.append(imgUrl)
        m = re.search(r"<a href='(?P<secondPageUrl>http://[^']+)'[^>]*>", line)
        if m:
            secondPageUrl = m.group("secondPageUrl")
        else:
            m = re.search(r"<img src='http://comicmenu\.mt\.co\.kr/images/btn_cartoon_2p\.gif'/>", line)
            if m:
                break
    # remove the trailing ad image
    # BUG FIX: guard the pop() — an empty list raised IndexError
    if imgUrlList:
        imgUrlList.pop()
    if secondPageUrl != "":
        cmd = "wget.sh '%s' | extract.py '%s'" % (secondPageUrl, sys.argv[1])
        result = feedmakerutil.execCmd(cmd)
        # BUG FIX: execCmd can return a falsy value on failure;
        # calling .split() on it crashed
        if result:
            for line in result.split("\n"):
                m = imgRe.search(line)
                if m:
                    imgUrl = m.group("imgUrl")
                    if adRe.search(imgUrl):
                        continue
                    imgUrlList.append(imgUrl)
    for imgUrl in imgUrlList:
        print("<img src='%s' width='100%%'/>" % (imgUrl))
def main():
    """Echo stdin, then download the daum webtoon image-list feed for the
    episode referenced by sys.argv[1] and print its image tags.

    Cleanup: removed unused locals (imgPrefix, imgIndex, imgExt, numUnits,
    img_file_arr, img_url_arr, img_size_arr) and redundant empty pre-inits.
    """
    for line in feedmakerutil.readStdinAsLineList():
        print(line.rstrip())
    postLink = sys.argv[1]
    m = re.search(r"http://cartoon\.media\.daum\.net/(?P<mobile>m/)?webtoon/viewer/(?P<episodeId>\d+)$", postLink)
    if m:
        mobile = m.group("mobile")
        episodeId = m.group("episodeId")
        # the mobile site serves the image list from a different endpoint
        if mobile and mobile == "m/":
            url = "http://cartoon.media.daum.net/data/mobile/webtoon/viewer?id=" + episodeId
        else:
            url = "http://cartoon.media.daum.net/webtoon/viewer_images.js?webtoon_episode_id=" + episodeId
        cmd = "wget.sh '%s'" % (url)
        #print(cmd)
        result = feedmakerutil.execCmd(cmd)
        #print(result)
        if not result:
            die("can't download the page html from '%s'" % (url))
        # the response is loose JSON-ish text; split on '}' or newline and
        # pick out entries that carry a url plus an imageOrder field
        for line in re.split(r"}|\n", result):
            m = re.search(r"\"url\":\"(?P<imgUrl>http://[^\"]+)\",(?:(?:.*imageOrder)|(?:\s*$))", line)
            if m:
                imgUrl = m.group("imgUrl")
                # skip the flash video-player entry
                if re.search(r"VodPlayer\.swf", imgUrl):
                    continue
                print("<img src='%s' width='100%%'/>" % (imgUrl))
def traverseElement(element, url, encoding):
    """Recursively walk a parsed HTML element tree, writing a simplified
    HTML rendering to stdout.

    Returns 1 if anything was emitted for this subtree, -1 otherwise.

    NOTE(review): `element` appears to be a BeautifulSoup node (it is
    tested with .name, .contents, has_attr and against bs4's Comment) --
    confirm. `encoding` is only passed down recursively; it is never used
    directly in this function. `url` is the page URL used to absolutize
    relative links/image paths.
    """
    # footnoteNum records the id seen in an openFootnoteLayer('N'...) link
    # so that the matching footnote layer <div> can be identified later.
    global footnoteNum
    ret = -1
    #print("# traverseElement()")
    if isinstance(element, Comment):
        # skip sub-elements
        return ret
    elif not hasattr(element, 'name') or element.name == None:
        # text or self-close element (<br/>)
        p = re.compile("^\s*$")
        if not p.match(str(element)):
            # emit non-whitespace text, HTML-escaped
            sys.stdout.write("%s" % html.escape(str(element)))
            ret = 1
        return ret
    else:
        # element
        #print("#%s#" % element.name)
        # Principles:
        # Every element contains other elements or text. So in general we
        # write the open tag, process the children with a recursive call,
        # and then write the close tag.
        #
        # Exception handling:
        # - images must emit their src attribute, followed by a <br/>
        # - naver.net image paths ending in /17.jpg must be removed
        # - table-related tags are all ignored?
        # - javascript? flash?
        openCloseTag = False
        if element.name == "p":
            print("<p>")
            for e in element.contents:
                ret = traverseElement(e, url, encoding)
            # After processing the children we must return here; otherwise,
            # for a <p> that directly wraps text, duplicated content could
            # be emitted.
            print("</p>")
            ret = 1
            return ret
        elif element.name == "img":
            src = ""
            # prefer the various lazy-loading attributes over plain src
            if element.has_attr("data-lazy-src"):
                dataLazySrc = element["data-lazy-src"]
                if not re.search(r'(https?:)?//', dataLazySrc):
                    dataLazySrc = feedmakerutil.concatenateUrl(url, dataLazySrc)
                src = dataLazySrc
            elif element.has_attr("lazysrc"):
                lazySrc = element["lazysrc"]
                if not re.search(r'(https?:)?//', lazySrc):
                    lazySrc = feedmakerutil.concatenateUrl(url, lazySrc)
                src = lazySrc
            elif element.has_attr("data-src"):
                dataSrc = element["data-src"]
                if not re.search(r'(https?:)?//', dataSrc):
                    dataSrc = feedmakerutil.concatenateUrl(url, dataSrc)
                src = dataSrc
            elif element.has_attr("data-original"):
                dataSrc = element["data-original"]
                if not re.search(r'(https?:)?//', dataSrc):
                    dataSrc = feedmakerutil.concatenateUrl(url, dataSrc)
                src = dataSrc
            elif element.has_attr("src"):
                src = element["src"]
                if not re.search(r'(https?:)?//', src):
                    src = feedmakerutil.concatenateUrl(url, src)
            if "ncc.phinf.naver.net" in src and ("/17.jpg" in src or "/8_17px.jpg" in src or "/7px.jpg" in src or "/20px.jpg" in src):
                # remove images that are not accessible from outside
                return ret
            if src and src != "":
                if re.search(r'^//', src):
                    # protocol-relative URL: force http
                    src = re.sub(r'^//', 'http://', src)
                sys.stdout.write("<img src='%s'" % src)
                if element.has_attr("width"):
                    sys.stdout.write(" width='%s'" % element["width"])
                sys.stdout.write("/>\n")
                ret = 1
        elif element.name in ("input"):
            # <input class="originSrc" value="..."> carries an image URL
            if checkElementClass(element, "input", "originSrc"):
                if element.has_attr("value"):
                    value = element["value"]
                    if not re.search(r'(https?:)?//', value):
                        value = feedmakerutil.concatenateUrl(url, value)
                    sys.stdout.write("<img src='%s'/>\n" % value)
                    ret = 1
        elif element.name == "canvas":
            # canvas elements sometimes carry their image in data attributes
            src = ""
            if element.has_attr("data-original"):
                src = element["data-original"]
            elif element.has_attr("data-src"):
                src = element["data-src"]
            if src and src != "":
                sys.stdout.write("<img src='%s'" % src)
                if element.has_attr("width"):
                    sys.stdout.write(" width='%s'" % element["width"])
                sys.stdout.write("/>\n")
                ret = 1
        elif element.name == "a":
            if element.has_attr("onclick"):
                # remove footnote-layer toggle links; remember the number of
                # an "open" link so its layer can be matched below
                m = re.search(r"(open|close)FootnoteLayer\('(\d+)'", element["onclick"])
                if m:
                    openOrClose = m.group(1)
                    if openOrClose == "open":
                        footnoteNum = m.group(2)
                    return ret
            if element.has_attr("href"):
                # complementing href value
                href = element["href"]
                if not re.search(r'(https?:)?//', href):
                    href = feedmakerutil.concatenateUrl(url, href)
                # an A tag must print its href and target attributes
                sys.stdout.write("<a href='%s'" % href)
                if element.has_attr("target"):
                    sys.stdout.write(" target='%s'>\n" % element["target"])
                else:
                    sys.stdout.write(">")
                ret = 1
                openCloseTag = True
        elif element.name in ("iframe", "embed"):
            if element.has_attr("src"):
                src = element["src"]
                if "video_player.nhn" in src or ".swf" in src or "getCommonPlayer.nhn" in src:
                    # display a [Flash Player] placeholder for flash files
                    print("[Flash Player]<br/>")
                    print("<%s src='%s'></%s><br/>" % (element.name, src, element.name))
                    print("<a href='%s'>%s</a><br/>" % (src, src))
                else:
                    sys.stdout.write("%s\n" % str(element))
                ret = 1
        elif element.name in ("param", "object"):
            # flash embedded via <param name="Src" value="....swf">
            if element.has_attr("name") and element["name"] == "Src" and element.has_attr("value") and ".swf" in element["value"]:
                src = element["value"]
                print("[Flash Player]<br/>")
                print("<video src='%s'></video><br/>" % (src))
                print("<a href='%s'>%s</a><br/>" % (src, src))
                ret = 1
        elif element.name == "map":
            # image map
            # extract only link information from area element
            for child in element.contents:
                if hasattr(child, "name") and child.name == "area":
                    linkHref = "#"
                    linkTitle = "empty link title"
                    if child.has_attr("href"):
                        linkHref = child["href"]
                    if child.has_attr("alt"):
                        linkTitle = child["alt"]
                    print("<br/><br/><strong><a href='%s'>%s</a></strong><br/><br/>" % (linkHref, linkTitle))
                    ret = 1
        elif element.name in ("o:p", "st1:time"):
            # skip unknown element
            return ret
        elif element.name in ("script"):
            # skip sub-element
            return ret
        elif element.name in ("v:shapetype", "qksdmssnfl", "qksdmssnfl<span"):
            # skip malformed element
            return ret
        elif element.name in ("style", "st1:personname", "script"):
            # skip sub-elements
            # NOTE(review): "script" is already handled above, so it is
            # unreachable here -- confirm whether this duplicate is intended
            return ret
        elif element.name in ("xmp", "form"):
            ret = 1
        else:
            if checkElementClass(element, "div", "paginate_v1"):
                # <div class="paginate_v1">...
                # prefetch the pages that would be loaded via ajax
                matches = re.findall(r"change_page\('[^']+/literature_module/(\d+)/literature_(\d+)_(\d+)\.html'", str(element))
                for match in matches:
                    leafId = int(match[0])
                    articleNum = int(match[1])
                    pageNum = int(match[2])
                    url = "http://navercast.naver.com/ncc_request.nhn?url=http://data.navercast.naver.com/literature_module/%d/literature_%d_%d.html" % (leafId, articleNum, pageNum)
                    cmd = "wget.sh '%s' | extract_literature.py" % (url)
                    #print(cmd)
                    result = feedmakerutil.execCmd(cmd)
                    if result:
                        print(result)
                        ret = 1
                return ret
            elif checkElementClass(element, "div", "view_option option_top"):
                # remove the font-size / bookmark image area in
                # "Today's Literature"
                return ret
            elif checkElementClass(element, "span", "page_prev") or checkElementClass(element, "span", "page_next"):
                # <span class="page_prev">... or <span class="page_next">...
                # remove the previous/next page arrow link areas
                return ret
            elif checkElementClass(element, "dl", "designlist"):
                # <dl class="designlist">...
                # skip this element and sub-elements
                return ret
            elif checkElementClass(element, "div", "na_ly_cmt"):
                # compare against the number from the
                # <a onclick="openFootnoteLayer('N'...) link seen earlier;
                # only the matching footnote layer is kept
                if hasattr(element, "id"):
                    if element["id"] != "footnoteLayer" + str(footnoteNum):
                        return ret
                #else:
                    #print str(element)
            else:
                # generic element: emit open tag now, close tag after the
                # children have been processed
                sys.stdout.write("<%s>\n" % element.name)
                openCloseTag = True
                ret = 1
        # recurse into children (skipping bare newlines between siblings)
        if hasattr(element, 'contents'):
            for e in element.contents:
                if e == "\n":
                    continue
                else:
                    ret = traverseElement(e, url, encoding)
        elif isinstance(element, Comment):
            return ret
        else:
            sys.stdout.write(element)
            ret = 1
            return ret
        if openCloseTag == True:
            sys.stdout.write("</%s>\n" % element.name)
            ret = 1
        return ret