def main():
    """Scrape t-store webtoon list HTML from stdin and print "link<TAB>title" lines.

    Prints only the first 5 collected items.
    """
    link = ""
    title = ""
    state = 0
    resultList = []
    urlPrefix = "http://m.tstore.co.kr/mobilepoc"
    for line in feedmakerutil.readStdinAsLineList():
        if state == 0:
            m = re.search(r'''
                <a [^>]* goInnerUrlDetail\( \\ '
                (?P<url>/webtoon/webtoonList[^']+)
                \\
                ''', line, re.VERBOSE)
            if m:
                link = urlPrefix + m.group("url")
                # BUG FIX: unescape HTML-encoded ampersands; the original
                # r'&' -> '&' substitution was a no-op (presumably mangled
                # from '&amp;')
                link = re.sub(r'&amp;', '&', link)
                state = 1
        elif state == 1:
            m = re.search(r'''
                <dt>
                (?P<title>.+)
                </dt>
                ''', line, re.VERBOSE)
            if m:
                title = m.group("title")
                resultList.append((link, title))
                state = 0
    # emit only the first 5 entries
    for link, title in resultList[:5]:
        print("%s\t%s" % (link, title))
def main():
    """Echo stdin, then fetch Naver Post clip-content JSON for up to 30 clips.

    The volume/member numbers are parsed from sys.argv[1]; fetching stops
    after 3 failed clip lookups.
    """
    volumeNo = None
    memberNo = None
    m = re.search(r'volumeNo=(?P<volumeNo>\d+)&memberNo=(?P<memberNo>\d+)', sys.argv[1])
    if m:
        volumeNo = int(m.group("volumeNo"))
        memberNo = int(m.group("memberNo"))
    lineList = feedmakerutil.readStdinAsLineList()
    for line in lineList:
        line = line.rstrip()
        print(line)
    failureCount = 0
    link_prefix = "http://m.post.naver.com/viewer/clipContentJson.nhn?volumeNo=%d&memberNo=%d&clipNo=" % (volumeNo, memberNo)
    for clipNo in range(30):
        link = link_prefix + str(clipNo)
        cmd = 'wget.sh "%s" utf8' % (link)
        result = feedmakerutil.execCmd(cmd)
        # ROBUSTNESS: 'not result' also treats an empty response as a failure
        # (the original 'result == False' would pass "" on to json.loads)
        if not result or re.search(r'"notExistClip"', result):
            failureCount = failureCount + 1
            if failureCount > 2:
                break
        else:
            clipContent = json.loads(result)["clipContent"]
            # BUG FIX: the replacement must be a raw string so '\1' is a group
            # backreference; in the original non-raw string it was the control
            # character '\x01'
            clipContent = re.sub(r'src="{{{[^\|]*\|/([^\|]+)\|\d+|\d+}}}"', r'src="http://post.phinf.naver.net/\1"', clipContent)
            clipContent = re.sub(r'<img src=\'http://static.post.naver.net/image/im/end/toast_flick.png\'/>', '', clipContent)
            print(clipContent)
def main():
    """Find a redirect URL prefix on stdin, then collect <option> entries.

    Prints the last -n (default 1000) "link<TAB>title" pairs.
    """
    state = 0
    resultList = []
    urlPrefix = ""
    numOfRecentFeeds = 1000
    optlist, args = getopt.getopt(sys.argv[1:], "n:")
    for o, a in optlist:
        if o == '-n':
            numOfRecentFeeds = int(a)
    for line in feedmakerutil.readStdinAsLineList():
        if state == 0:
            m = re.search(r'window\.location\.href\s*=\s*"(?P<url>[^"]+)"', line)
            if m:
                # BUG FIX: was 'urlPrefix = + m.group("url")' -- the stray
                # unary '+' on a str raises TypeError at runtime
                urlPrefix = m.group("url")
                state = 1
        elif state == 1:
            # BUG FIX: was 'm.findall(...)' on a Match object; findall is a
            # function of the re module
            matches = re.findall(r'<option (?:selected="" )?value="(?P<url>\d+)">(?P<title>[^<]+)</option>', line)
            for match in matches:
                # BUG FIX: the groups capture (url, title) in that order; the
                # original indexed them swapped
                link = urlPrefix + match[0]
                title = match[1]
                resultList.append((link, title))
    for (link, title) in resultList[-numOfRecentFeeds:]:
        print("%s\t%s" % (link, title))
def main():
    # Scrape a github-trend-kr file listing: first find a "path" prefix, then
    # collect markdown file names under it as (link, title) pairs; print the
    # last -n (default 10) of them.
    state = 0
    numOfRecentFeeds = 10
    optlist, args = getopt.getopt(sys.argv[1:], "n:")
    for o, a in optlist:
        if o == '-n':
            numOfRecentFeeds = int(a)
    lineList = feedmakerutil.readStdinAsLineList()
    resultList = []
    for line in lineList:
        if state == 0:
            m1 = re.search(r'"path"\s*:\s*"(?P<prefix>[^"]+)"', line)
            if m1:
                # NOTE(review): 'prefix' is only bound here; a state-1 line
                # before any "path" match would raise NameError -- confirm the
                # input always starts with a "path" entry
                prefix = m1.group("prefix")
                state = 1
        elif state == 1:
            m2 = re.search(r'{"name"\s*:\s*"(?P<link>\d[^"]+.md)"}', line)
            if m2:
                link = "http://teamsego.github.io/github-trend-kr/" + prefix + "/" + m2.group("link")
                title = m2.group("link")
                resultList.append((link, title))
            m3 = re.search(r'{"volume":', line)
            if m3:
                # NOTE(review): inserting into lineList while iterating it
                # shifts the iterator back onto this same line; since the line
                # still matches m3, this looks like an infinite loop -- confirm
                # the intended behavior before restructuring
                lineList.insert(0, line)
                state = 0
    #print(state, end=' ')
    for (link, title) in resultList[-numOfRecentFeeds:]:
        print("%s\t%s" % (link, title))
def main():
    """Scan Kakao Page list HTML on stdin and print "link<TAB>title" pairs."""
    recent_count = 10
    opts, _args = getopt.getopt(sys.argv[1:], "n:")
    for opt, val in opts:
        if opt == '-n':
            recent_count = int(val)
    entries = []
    current_link = ""
    phase = 0
    for row in feedmakerutil.readStdinAsLineList():
        if phase == 0:
            # a list item carrying the product id starts a new entry
            found = re.search(r'<li class="list viewerLinkBtn (?:pointer )?list_[HV][^>]*data-productid="(?P<id>\d+)', row)
            if found:
                current_link = "http://page.kakao.com/viewer?productId=" + found.group("id")
                phase = 1
        elif phase == 1:
            # the title text follows the next title-span marker
            if re.search(r'<span class="Lfloat (?:listTitle )?ellipsis">', row):
                phase = 2
        elif phase == 2:
            found = re.search(r'^\s*(?P<title>\S+.*\S+)\s*$', row)
            if found:
                cleaned = re.sub(r'&(lt|gt);', '', found.group("title"))
                entries.append((current_link, cleaned))
                phase = 0
    for link, title in entries[:recent_count]:
        print("%s\t%s" % (link, title))
def main():
    """Collect article (link, title) pairs across paginated list pages.

    The first page arrives on stdin; further pages are fetched until -n
    items (default 30) are gathered or the last page is reached.
    """
    encoding = "utf-8"
    max_feeds = 30
    opts, _args = getopt.getopt(sys.argv[1:], "n:")
    for opt, val in opts:
        if opt == '-n':
            max_feeds = int(val)
    # first list page comes from stdin
    page_lines = feedmakerutil.readStdinAsLineList()
    collected = printArticleUrlList(page_lines)
    next_url, reached_end = getNextPageUrl(page_lines, "articleTypeList")
    # follow "next page" links until we have enough items or run out of pages
    while len(collected) < max_feeds:
        page_lines = re.split(r'\n', getPageContent(next_url, encoding))
        collected.extend(printArticleUrlList(page_lines))
        next_url, reached_end = getNextPageUrl(page_lines, "cateList")
        if reached_end:
            break
    for (link, title) in collected[:max_feeds]:
        print("%s\t%s" % (link, title))
def main():
    """Scrape Naver Music "today's music" entries from stdin; print up to 7."""
    link = ""
    title = ""
    state = 0
    resultList = []
    urlPrefix = "http://music.naver.com"
    for line in feedmakerutil.readStdinAsLineList():
        if state == 0:
            m = re.search(r'<a\s+class="[^"]+"\s+href=\"(?P<url>/todayMusic/index\.nhn[^\"]+)\"[^>]*>', line)
            if m:
                link = urlPrefix + m.group("url")
                # BUG FIX: unescape HTML-encoded ampersands; the original
                # r'&' -> '&' substitution was a no-op (presumably mangled
                # from '&amp;')
                link = re.sub(r'&amp;', '&', link)
                state = 1
        elif state == 1:
            m = re.search(r'<span\s+class="[^"]+"\s+title=\"[^\"]+\"[^>]*>(?P<title>[^<]+)</span>', line)
            if m:
                title = m.group("title")
                resultList.append((link, title))
                state = 0
    # print at most the first 7 items (replaces the manual counter loop)
    for link, title in resultList[:7]:
        print("%s\t%s" % (link, title))
def main():
    """Print cartoon <img> tags from stdin HTML; fetch page-2 images if linked."""
    second_url = ""
    for raw in feedmakerutil.readStdinAsLineList():
        raw = raw.rstrip()
        # a "page 2" button link (image button form)
        m = re.search(
            r"<a href='(?P<secondPageUrl>http://[^']+)'[^>]*><img src='http://cwstatic\.asiae\.co\.kr/images/cartoon/btn_s\.gif'/>",
            raw,
        )
        if m:
            second_url = m.group("secondPageUrl")
            continue
        # a "page 2" text link
        m = re.search(r"<a href='(?P<secondPageUrl>http://stoo.asiae.co.kr/cartoon/view.htm[^']*)'>2페이지</a>", raw)
        if m:
            second_url = m.group("secondPageUrl")
            continue
        # a content image on page 1
        m = re.search(r"<img src='(?P<imgUrl>http://cwcontent[^']+)'.*/>", raw)
        if m:
            print("<img src='%s' width='100%%'/>" % (m.group("imgUrl")))
    if second_url != "":
        cmd = "wget.sh '%s' | extract_element.py extraction" % (second_url)
        output = feedmakerutil.execCmd(cmd)
        if output:
            for raw in output.split("\n"):
                m = re.search(r"<img\s*[^>]*src=(?:\'|\")(?P<imgUrl>http://cwcontent[^\'\"]+)(?:\'|\").*/>", raw)
                if m:
                    print("<img src='%s' width='100%%'/>" % (m.group("imgUrl")))
def main():
    """Scrape cine21 magazine article links from stdin; print "link<TAB>title"."""
    link = ""
    title = ""
    state = 0
    lineList = feedmakerutil.readStdinAsLineList()
    for line in lineList:
        if state == 0:
            m1 = re.search(r'<a href="(?P<link>[^"]+mag_id=\d+)">', line)
            if m1:
                link = m1.group("link")
                # BUG FIX: unescape HTML-encoded ampersands; the original
                # r'&' -> '&' substitution was a no-op (presumably mangled
                # from '&amp;')
                link = re.sub(r'&amp;', '&', link)
                link = "http://www.cine21.com" + link
                state = 1
        elif state == 1:
            m2 = re.search(r'<span class="tit">(?P<title>[^<]+)</span>', line)
            if m2:
                title = m2.group("title")
                # skip school ads / recruiting notices; stay in state 1 so the
                # next title line can still pair with this link
                if re.search(r'(대학교|영화학|모집)', title):
                    continue
                # BUG FIX: the '<'/'>' substitutions were no-ops because the
                # capture is [^<]+; restore the '&lt;'/'&gt;' entity patterns
                title = re.sub(r'&lt;', '"', title)
                title = re.sub(r'&gt;', '"', title)
                print("%s\t%s" % (link, title))
                state = 0
def main():
    """Extract Daum webtoon episode entries from stdin and print them.

    Each entry prints as "link<TAB>NNNN. title", limited to -n items
    (default 30).
    """
    base = "http://cartoon.media.daum.net/m/webtoon/viewer/"
    max_items = 30
    opts, _args = getopt.getopt(sys.argv[1:], "n:")
    for opt, val in opts:
        if opt == "-n":
            max_items = int(val)
    # compile once; the same pattern is applied to every input line
    pattern = re.compile(r'"id":(?P<id>\d+),"episode":(?P<episode>\d+),"title":"(?P<title>[^"]+)",')
    items = []
    for row in feedmakerutil.readStdinAsLineList():
        for m in pattern.finditer(row):
            episode_no = int(m.group("episode"))
            label = "%04d. %s" % (episode_no, m.group("title"))
            items.append((base + m.group("id"), label))
    for link, title in items[:max_items]:
        print("%s\t%s" % (link, title))
def main():
    """Scrape t-store webtoon prodId/prodNm pairs from stdin JSON-ish lines.

    Prints the last -n (default 1000) "link<TAB>title" pairs.
    """
    urlPrefix = "http://m.tstore.co.kr/mobilepoc/webtoon/webtoonDetail.omp?prodId="
    resultList = []
    numOfRecentFeeds = 1000
    optlist, args = getopt.getopt(sys.argv[1:], "n:")
    for o, a in optlist:
        if o == '-n':
            numOfRecentFeeds = int(a)
    # (dropped the unused 'count', 'link' and 'title' initializations)
    for line in feedmakerutil.readStdinAsLineList():
        matches = re.findall(r'''
            [^}]*,
            "prodId"\s*:\s*"\s*([^"]+)\s*",
            [^}]*,
            "prodNm"\s*:\s*"\s*([^"]+)\s*",
            ''', line, re.VERBOSE)
        for match in matches:
            link = urlPrefix + match[0]
            title = match[1]
            if link and title:
                resultList.append((link, title))
    for (link, title) in resultList[-numOfRecentFeeds:]:
        print("%s\t%s" % (link, title))
def main():
    """Scrape Chosun sports cartoon episodes from stdin.

    Pairs each 'section1' episode number with the following 'section2' link
    and title; prints the last -n (default 1000) pairs.
    """
    episodeNum = ""
    state = 0
    urlPrefix = "http://sports.chosun.com/cartoon"
    resultList = []
    numOfRecentFeeds = 1000
    optlist, args = getopt.getopt(sys.argv[1:], "n:")
    for o, a in optlist:
        if o == '-n':
            numOfRecentFeeds = int(a)
    for line in feedmakerutil.readStdinAsLineList():
        if state == 0:
            m = re.search(r'<li class="section1">(?P<episodeNum>[^<]+)</li>', line)
            if m:
                episodeNum = m.group("episodeNum")
                state = 1
        elif state == 1:
            m = re.search(r'<li class="section2"><a href="(?P<url>[^"]+)">\[[^\]]+\] (?P<title>[^<]+)</a></li>', line)
            if m:
                url = m.group("url")
                title = m.group("title")
                # BUG FIX: the original ran the substitution on the stale
                # previous link and then overwrote it; build the link first,
                # then unescape HTML-encoded ampersands ('&amp;', which the
                # no-op '&' pattern was presumably mangled from)
                link = urlPrefix + url
                link = re.sub(r'&amp;', '&', link)
                title = episodeNum + " " + title
                resultList.append((link, title))
                # BUG FIX: return to state 0 so the next section1 line can
                # refresh episodeNum (the original stayed in state 1 forever,
                # reusing the first episode number for every entry)
                state = 0
    for link, title in resultList[-numOfRecentFeeds:]:
        print("%s\t%s" % (link, title))
def main():
    """Scrape Nate webtoon episode list from stdin; print "link<TAB>title"."""
    link = ""
    episodeNum = ""
    state = 0
    urlPrefix = "http://comics.nate.com"
    for line in feedmakerutil.readStdinAsLineList():
        if state == 0:
            m = re.search(r'<a href="(?P<url>/webtoon/detail\.php\?[^"]+)&category=\d+">', line)
            if m:
                url = urlPrefix + m.group("url")
                # BUG FIX: unescape HTML-encoded ampersands; the original
                # r'&' -> '&' substitution was a no-op (presumably mangled
                # from '&amp;')
                link = re.sub(r"&amp;", "&", url)
                state = 1
        elif state == 1:
            m = re.search(r'<span class="tel_episode">(?P<episodeNum>.+)</span>', line)
            if m:
                episodeNum = m.group("episodeNum")
                state = 2
        elif state == 2:
            m = re.search(r'<span class="tel_title">(?P<title>.+)</span>', line)
            if m:
                title = episodeNum + " " + m.group("title")
                print("%s\t%s" % (link, title))
                state = 0
def main():
    """Rewrite Naver comic image tags from stdin; probe for extra numbered images.

    sys.argv[1] is the referring page URL used for the existence probes.
    """
    img_host = ""
    img_path = ""
    img_index = -1
    img_ext = "jpg"
    page_url = sys.argv[1]
    numbered_pat = r"<img src='http://(?P<imgHost>imgcomic.naver.(?:com|net))/(?P<imgPath>[^']+_)(?P<imgIndex>\d+)\.(?P<imgExt>jpg|gif)"
    plain_pat = r"<img src='http://(?P<imgHost>imgcomic.naver.(?:com|net))/(?P<imgPath>[^']+)\.(?P<imgExt>jpg|gif)"
    for raw in feedmakerutil.readStdinAsLineList():
        raw = raw.rstrip()
        # pass meta/style lines through untouched
        if re.search(r"<(meta|style)", raw):
            print(raw)
            continue
        m = re.search(numbered_pat, raw, re.IGNORECASE)
        if m:
            # numbered image: remember host/path/index for the probe loop below
            img_host = m.group("imgHost")
            img_path = m.group("imgPath")
            img_index = int(m.group("imgIndex"))
            img_ext = m.group("imgExt")
            print("<img src='http://%s/%s%d.%s' width='100%%'/>" % (img_host, img_path, img_index, img_ext))
            continue
        m = re.search(plain_pat, raw, re.IGNORECASE)
        if m:
            img_host = m.group("imgHost")
            img_path = m.group("imgPath")
            img_ext = m.group("imgExt")
            print("<img src='http://%s/%s.%s' width='100%%'/>" % (img_host, img_path, img_ext))
    if img_path != "" and img_index >= 0:
        # probe indices 0..59 for additional dynamically loaded images
        for i in range(60):
            probe_url = "http://%s/%s%d.%s" % (img_host, img_path, i, img_ext)
            cmd = 'wget.sh --spider --referer "%s" "%s"' % (page_url, probe_url)
            if feedmakerutil.execCmd(cmd):
                print("<img src='http://%s/%s%d.%s' width='100%%'/>" % (img_host, img_path, i, img_ext))
def main():
    """Scrape MT comic list entries from stdin; print "link<TAB>title"."""
    linkPrefix = "http://comic.mt.co.kr/"
    for line in feedmakerutil.readStdinAsLineList():
        m = re.search(r'''
            <li>
            <a\s+href="\./(?P<link>comicView\.htm[^"]+)">
            <img\s+height="\d+"\s+src="[^"]+"\s+width="\d+"\s*/>
            </a>
            <p\ class="[^"]+">
            (?P<title1>[^<]+)
            <br/?>
            <b>
            (?P<title2>[^<]*)
            </b>
            (?:</br>)?
            </p>
            </li>
            ''', line, re.VERBOSE)
        if m:
            link = linkPrefix + m.group("link")
            title = m.group("title1") + " " + m.group("title2")
            # BUG FIX: unescape HTML-encoded ampersands; the original
            # r'&' -> '&' substitution was a no-op (presumably mangled
            # from '&amp;')
            link = re.sub(r"&amp;", "&", link)
            # drop the page-number parameter from the link
            link = re.sub(r"&nPage=[^&]*", "", link)
            print("%s\t%s" % (link, title))
def main():
    """Scrape Khan sports cartoon episode <option> entries from stdin.

    Finds the viewer URL via a location.href assignment, then pairs each
    option's cartoon/episode ids into a link; prints the last -n pairs.
    """
    url = ""
    state = 0
    urlPrefix = "http://sports.khan.co.kr"
    resultList = []
    numOfRecentFeeds = 1000
    optlist, args = getopt.getopt(sys.argv[1:], "n:")
    for o, a in optlist:
        if o == '-n':
            numOfRecentFeeds = int(a)
    for line in feedmakerutil.readStdinAsLineList():
        if state == 0:
            m = re.search(r'document\.location\.href\s*=\s*"(?P<url>[^"]+)"', line)
            if m:
                url = m.group("url")
                state = 1
        elif state == 1:
            matches = re.findall(r"<option value='(?P<cartoonId>\d+)\|(?P<episodeId>\d+)(?:\|\d+)?'(?: selected)?>(?P<title>[^<]+)</option>", line)
            for match in matches:
                cartoonId = match[0]
                episodeId = match[1]
                title = match[2]
                link = urlPrefix + url + cartoonId + "&page=" + episodeId
                # BUG FIX: unescape HTML-encoded ampersands; the original
                # r'&' -> '&' substitution was a no-op (presumably mangled
                # from '&amp;')
                link = re.sub(r'&amp;', '&', link)
                resultList.append((link, title))
    for (link, title) in resultList[-numOfRecentFeeds:]:
        print("%s\t%s" % (link, title))
def main():
    """Scrape navercast card links from stdin; print "link<TAB>title"."""
    link = ""
    urlPrefix = "http://navercast.naver.com/"
    title = ""
    state = 0
    for line in feedmakerutil.readStdinAsLineList():
        if state == 0:
            m = re.search(r'<a class="card" href="/?(?P<url>contents\.nhn\?contents_id=\d+[^"]*)"[^>]*>', line)
            if m:
                url = m.group("url")
                # BUG FIX: unescape HTML-encoded ampersands; the original
                # r'&' -> '&' substitution was a no-op (presumably mangled
                # from '&amp;')
                url = re.sub(r"&amp;", "&", url)
                link = urlPrefix + url
                state = 1
        elif state == 1:
            m = re.search(r'<strong>(?P<mainTitle>[^<]+)</strong><span(?: alt="[^"]+" title="(?P<subTitle1>[^"]+)")?>(?P<subTitle2>.*)</span>', line)
            if m:
                # prefer the span's title attribute over its body when present
                if m.group("subTitle1"):
                    title = m.group("mainTitle") + " - " + m.group("subTitle1")
                else:
                    title = m.group("mainTitle") + " - " + m.group("subTitle2")
                print("%s\t%s" % (link, title))
                state = 0
                link = ""
                title = ""
def main():
    """Find a viewer's data URL in stdin HTML, fetch it, and print image tags.

    Requires both an image URL prefix and a documentURL; dies otherwise.
    """
    pageUrl = sys.argv[1]
    urlPrefix = ""
    dataUrl = ""
    for line in feedmakerutil.readStdinAsLineList():
        m = re.search(r"jpg: '(?P<urlPrefix>[^']+\/){=filename}\?type=[^']+'", line)
        if m:
            urlPrefix = m.group("urlPrefix")
        else:
            m = re.search(r"documentURL:\s*'(?P<dataUrl>[^']+)'", line)
            if m:
                dataUrl = m.group("dataUrl")
                break
    # BUG FIX: was 'not dataUrl or urlPrefix', which died precisely when the
    # prefix WAS found; both values are required to proceed
    if not dataUrl or not urlPrefix:
        die("can't get a data url from input")
    cmd = "wget.sh '%s' utf8 | gunzip 2> /dev/null || wget.sh '%s' utf8" % (dataUrl, dataUrl)
    with subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) as p:
        for line in p.stdout:
            # BUG FIX: p.stdout yields bytes; decode before applying str regexes
            line = line.decode("utf-8", errors="replace").rstrip()
            matches = re.findall(r"[^\"]+\"\s*:\s*\"(assets/still/[^\"]+\.(?:png|jpg))\"", line)
            for match in matches:
                # BUG FIX: findall with a single group yields plain strings, so
                # the original 'match[0]' took only the first character
                imgUrl = urlPrefix + match
                # BUG FIX: a literal '%' must be doubled in a %-format string;
                # the original "100%'" raised ValueError at runtime
                print("<img src='%s' width='100%%'/>" % (imgUrl))
def main():
    """Print Daum webtoon (link, title) pairs scraped from stdin lines."""
    base = "http://cartoon.media.daum.net/webtoon/view/"
    pattern = r'nickname":"([^"]+)"[^"]*"title":"([^"]+)'
    for row in feedmakerutil.readStdinAsLineList():
        for nickname, title in re.findall(pattern, row):
            print("%s\t%s" % (base + nickname, title))
def main():
    """Scrape Naver sports magazine links from stdin; print "link<TAB>title"."""
    base = "http://sports.news.naver.com"
    for row in feedmakerutil.readStdinAsLineList():
        found = re.search(r'<a href="(?P<url>/magazineS/index\.nhn\?id=\d+)">(?P<title>[^<]+)</a>', row)
        if found:
            print("%s\t%s" % (base + found.group("url"), found.group("title")))
def main():
    """Scrape Donga sports cartoon links from stdin; print "link<TAB>title"."""
    base = "http://sports.donga.com/cartoon"
    for row in feedmakerutil.readStdinAsLineList():
        found = re.search(r'<li(?: class="first")?><a href="(?P<url>\?cid=[^"]+)"><img alt="(?P<title>[^"]+)"', row)
        if found:
            print("%s\t%s" % (base + found.group("url"), found.group("title")))
def main():
    """Scrape Khan sports comics section links from stdin."""
    for line in feedmakerutil.readStdinAsLineList():
        m = re.search(r'<strong><a href="(?P<url>http://sports.khan.co.kr/comics/cartoon_view.html\?comics=b2c&sec_id=\d+)">(?P<title>[^<]+)</a></strong>', line)
        if m:
            link = m.group("url")
            title = m.group("title")
            # BUG FIX: unescape HTML-encoded ampersands; the original
            # r'&' -> '&' substitution was a no-op (presumably mangled from
            # '&amp;' -- TODO confirm whether the href regex above should also
            # match '&amp;sec_id' in the raw HTML)
            link = re.sub(r'&amp;', '&', link)
            print("%s\t%s" % (link, title))
def main():
    """Print (link, title) pairs for numeric-path anchors found on stdin."""
    domain = getUrlDomainFromConfig()
    anchor_pat = r'<a href="/(\d+)"[^>]*>([^<]*)</a>'
    for row in feedmakerutil.readStdinAsLineList():
        for num, text in re.findall(anchor_pat, row):
            print("%s\t%s" % (domain + num, text))
def main():
    """Scrape Daum movie news article links from stdin; print "link<TAB>title"."""
    linkPrefix = "http://movie.daum.net/movieinfo/news/"
    for line in feedmakerutil.readStdinAsLineList():
        m = re.search(r"<h5><a\s+class=\"[^\"]+\"\s+href=\"(?P<link>movieInfoArticleRead\.do[^\"]+)\"[^>]*>(?P<title>.+)</a></h5>", line)
        if m:
            link = linkPrefix + m.group("link")
            # BUG FIX: unescape HTML-encoded ampersands; the original
            # r'&' -> '&' substitution was a no-op (presumably mangled
            # from '&amp;')
            link = re.sub(r"&amp;", "&", link)
            title = m.group("title")
            print("%s\t%s" % (link, title))
def main():
    """Scrape Naver webtoon title links from stdin; print "link<TAB>title"."""
    urlPrefix = "http://comic.naver.com"
    # (dropped the unused 'state', 'link' and 'title' initializations)
    for line in feedmakerutil.readStdinAsLineList():
        # NOTE: the dot in 'list\.nhn' is now escaped to match a literal '.'
        m = re.search(r'<a href="(?P<url>/webtoon/list\.nhn\?titleId=\d+)[^"]*"[^>]*><img[^>]*title="(?P<title>[^"]+)"[^>]*/?>', line)
        if m:
            url = m.group("url")
            title = m.group("title")
            link = urlPrefix + url
            print("%s\t%s" % (link, title))
def main():
    # Rewrite magazineS section-header images in stdin HTML into plain-text
    # section labels (PEOPLE, SPECIAL REPORT, COLUMN, ..., ZOOM IN), advancing
    # through the sections in the fixed order they appear on the page, and echo
    # every (possibly rewritten) line.
    state = 0
    for line in feedmakerutil.readStdinAsLineList():
        line = line.rstrip()
        if state == 0:
            # 67x9 header image -> "PEOPLE"
            pat = r"<img src=\"http://imgnews.naver.(com|net)/image/sports/\d+/magazineS/magazine_content/magazineS_\d+/\d+_file_image_0.jpg\" width=\"67\" height=\"9\" alt=\"[^\"]+\" />"
            m = re.search(pat, line)
            if m:
                line = re.sub(pat, "PEOPLE", line)
                state = 1
        elif state == 1:
            # 102x10 header image -> "SPECIAL REPORT"
            pat = r"<img src=\"http://imgnews.naver.(com|net)/image/sports/\d+/magazineS/magazine_content/magazineS_\d+/\d+_file_image_0.jpg\" width=\"102\" height=\"10\" alt=\"[^\"]+\" />"
            m = re.search(pat, line)
            if m:
                line = re.sub(pat, "SPECIAL REPORT", line)
                state = 2
        elif state == 2:
            # 57x10 header image -> "COLUMN"
            pat = r"<img src=\"http://imgnews.naver.(com|net)/image/sports/\d+/magazineS/magazine_content/magazineS_\d+/\d+_file_image_0.jpg\" width=\"57\" height=\"10\" alt=\"[^\"]+\" />"
            m = re.search(pat, line)
            if m:
                line = re.sub(pat, "COLUMN", line)
                state = 3
        elif state == 3:
            # 58x10 header inside <h2>: drop the whole heading line content
            pat = r"<h2><img src=\"http://imgnews.naver.(com|net)/image/sports/\d+/magazineS/magazine_content/magazineS_\d+/\d+_file_image_0.jpg\" width=\"58\" height=\"10\" alt=\"[^\"]+\" /></h2>.*"
            m = re.search(pat, line)
            if m:
                line = re.sub(pat, "", line)
                state = 4
        elif state == 4:
            # 29x10 header image -> "POLL"
            pat = r"<img src=\"http://imgnews.naver.(com|net)/image/sports/\d+/magazineS/magazine_content/magazineS_\d+/\d+_file_image_0.jpg\" width=\"29\" height=\"10\" alt=\"[^\"]+\" />"
            m = re.search(pat, line)
            if m:
                line = re.sub(pat, "POLL", line)
                state = 5
            else:
                # NOTE: unlike the other states, non-matching lines are
                # swallowed (not printed) while waiting for the POLL header
                continue
        elif state == 5:
            # opening list wrapper -> "ZOOM IN" heading
            pat = r"^<div><ul>"
            m = re.search(pat, line)
            if m:
                line = re.sub(pat, "<h2>ZOOM IN</h2><ul>", line)
                state = 6
        elif state == 6:
            # closing wrapper: rewrite it, print below, then stop entirely
            pat = r"^</li></ul></div>"
            m = re.search(pat, line)
            if m:
                line = re.sub(pat, "</li></ul>", line)
                break
        print(line)
def main():
    """Scrape Chosun sports cartoon title links from stdin.

    Skips the "coin" (daily fortune) entry; prints "link<TAB>title" pairs.
    """
    urlPrefix = "http://sports.chosun.com/cartoon/sub_list.htm?title="
    for line in feedmakerutil.readStdinAsLineList():
        m = re.search(r'<li><a href="[^"]*title=(?P<url>[^"&]+)[^"]*"><img alt="(?P<title>[^"]+)"', line)
        if m:
            if m.group("title") == "coin":
                # 오늘의운세는 skip (daily fortune entry)
                continue
            link = urlPrefix + m.group("url")
            title = m.group("title")
            # BUG FIX: unescape HTML-encoded ampersands; the original
            # r'&' -> '&' substitution was a no-op (presumably mangled
            # from '&amp;')
            link = re.sub(r'&amp;', '&', link)
            print("%s\t%s" % (link, title))
def main():
    """Scrape Asiae cartoon section links and titles from stdin.

    Each section link is paired with the description line that follows it.
    """
    link = ""
    title = ""
    urlPrefix = "http://stoo.asiae.co.kr"
    for line in feedmakerutil.readStdinAsLineList():
        m = re.search(r'<a href="(?P<url>/cartoon/list.htm\?sec=\d+)"', line)
        if m:
            url = m.group("url")
            # BUG FIX: build the link from the freshly matched url first; the
            # original ran the substitution on the stale previous link and the
            # '&' -> '&' pattern was a no-op (presumably mangled from '&amp;')
            link = urlPrefix + url
            link = re.sub(r'&amp;', '&', link)
        else:
            m = re.search(r'<dt class="desc">(?P<title>[^<]+)</dt>', line)
            if m:
                title = m.group("title")
                print("%s\t%s" % (link, title))
def main():
    """Scrape Naver blog post entries from stdin; print "link<TAB>title".

    Builds the link prefix from the "blogId" field, then decodes each
    URL-encoded post title.
    """
    urlPrefix = "http://blog.naver.com/PostView.nhn?blogId="
    for line in feedmakerutil.readStdinAsLineList():
        m = re.search(r'"blogId"\s*:\s*"(?P<blogId>[^"]+)"', line)
        if m:
            urlPrefix = urlPrefix + m.group("blogId") + "&logNo="
        matches = re.findall(r'"logNo":"(\d+)","title":"([^"]+)",', line)
        for match in matches:
            logNo = match[0]
            # titles are URL-encoded, with '+' standing in for spaces
            title = urllib.parse.unquote(match[1])
            title = re.sub(r"\+", " ", title)
            # BUG FIX: the original pattern was corrupted to a bare r'"""'
            # (which breaks the source syntactically); restore the '&quot;'
            # entity it was presumably mangled from
            title = re.sub(r"&quot;", "'", title)
            title = re.sub(r"&(lt|gt);", "", title)
            link = urlPrefix + logNo
            print("%s\t%s" % (link, title))
def main():
    """Scrape Nate webtoon list entries from stdin; print "link<TAB>title"."""
    link = ""
    state = 0
    urlPrefix = "http://comics.nate.com"
    for line in feedmakerutil.readStdinAsLineList():
        if state == 0:
            m = re.search(r'<a class="wtl_toon" href="(?P<url>/webtoon/list.php\?btno=\d+)[^"]*">', line)
            if m:
                # (dropped the dead 'link = 1' assignment that was immediately
                # overwritten by the line below)
                link = urlPrefix + m.group("url")
                state = 1
        elif state == 1:
            m = re.search(r'<span class="wtl_img"><i></i><img alt="(?P<title>[^"]+)"', line)
            if m:
                title = m.group("title")
                print("%s\t%s" % (link, title))
                state = 0