def get_results(): """Parse all search result pages.""" base = "http://www.drunkduck.com/search/?page=%d&search=&type=0&type=1&last_update=" href = re.compile(tagre("a", "href", r'(/[^"]+/)', before="size24 yanone blue")) num = re.compile(r'(\d+) pages?</span>') # store info in a dictionary {name -> number of comics} res = {} # a search for an empty string returned 825 result pages result_pages = 825 print("Parsing", result_pages, "search result pages...", file=sys.stderr) for i in range(1, result_pages + 1): print(i, file=sys.stderr, end=" ") handle_url(base % i, href, num, res) save_result(res)
def get_results(): """Parse all search result pages.""" base = "http://www.drunkduck.com/search/?page=%d&search=&type=0&type=1&last_update=" href = re.compile(tagre("a", "href", r'(/[^"]+/)', before="size24 yanone blue")) num = re.compile(r'(\d+) pages?</span>') # store info in a dictionary {name -> number of comics} res = {} # a search for an empty string returned 825 result pages result_pages = 825 print("Parsing", result_pages, "search result pages...", file=sys.stderr) session = requests.Session() for i in range(1, result_pages + 1): print(i, file=sys.stderr, end=" ") handle_url(base % i, session, href, num, res) save_result(res, json_file)
def test_regex(self):
    """Run every prepared (tag, value, domatch) case against the img/src matcher."""
    prefix = self.ValuePrefix
    checker = re.compile(tagre("img", "src", '(%s[^"]*)' % prefix))
    for case in self.TagTests:
        self.match_tag(checker, *case)
from __future__ import print_function import codecs import re import sys import os import requests sys.path.append(os.path.join(os.path.dirname(__file__), "..")) from dosagelib.util import getPageContent, asciify, unescape, tagre, check_robotstxt from dosagelib.scraper import get_scraperclasses from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name json_file = __file__.replace(".py", ".json") # <div class="comictitle"><strong><a target="_blank" onclick="pageTrackerCG._link('http://collegepros.comicgenesis.com'); return false;" href="http://collegepros.comicgenesis.com">Adventures of the College Pros</a> url_matcher = re.compile(r'<div class="comictitle"><strong>' + tagre("a", "href", r'(http://[^"]+)') + r"([^<]+)</a>") num_matcher = re.compile(r"Number of Days: (\d+)") # names of comics to exclude exclude_comics = [ "10", # page is gone "54sinRed", # page is 403 forbidden "6D4", # redirected to another page "AaaSoCAwesomenessandaSliceofCheese", # broken images "AcrossthePond", # page moved "ACDeceptibotscomic", # no images "AdamandSei", # page has 403 forbidden "AdamsRoadGang", # page is gone "ADVENTURERS", # page is gone "AiYaiYai", # page moved "AlltheCommies", # missing images
import sys import os import requests sys.path.append(os.path.join(os.path.dirname(__file__), "..")) # noqa from dosagelib.util import get_page, tagre, check_robotstxt from dosagelib.scraper import get_scrapers from scriptutil import contains_case_insensitive, save_result, load_result, truncate_name, format_name json_file = __file__.replace(".py", ".json") url_matcher = re.compile( tagre("td", "onmouseover", r'([^"]+)') + tagre("a", "href", r'([^"]+\.keenspot\.com/)[^"]*') + r"(?:<b>)?([^<]+)(?:</b>)?</a>" ) # names of comics to exclude exclude_comics = [ "BrawlintheFamily", # non-standard navigation "CrowScare", # non-standard navigation "Dreamless", # non-standard navigation "EV", # non-standard navigation "Exposure", # non-standard navigation "Flipside", # non-standard navigation "HerobyNight", # non-standard navigation "JadeWarriors", # non-standard navigation
def test_regex(self):
    """Exercise the img/src regex with each configured tag test case."""
    src_pattern = '(%s[^"]*)' % self.ValuePrefix
    img_matcher = re.compile(tagre("img", "src", src_pattern))
    for tag, value, domatch in self.TagTests:
        self.match_tag(img_matcher, tag, value, domatch)
""" from __future__ import print_function import codecs import re import sys import os import requests sys.path.append(os.path.join(os.path.dirname(__file__), "..")) from dosagelib.util import getPageContent, asciify, unescape, tagre, check_robotstxt from dosagelib.scraper import get_scraperclasses from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name json_file = __file__.replace(".py", ".json") url_matcher = re.compile( tagre("td", "onmouseover", r'([^"]+)') + tagre("a", "href", r'([^"]+\.keenspot\.com/)[^"]*') + r"(?:<b>)?([^<]+)(?:</b>)?</a>") # names of comics to exclude exclude_comics = [ "BrawlintheFamily", # non-standard navigation "CrowScare", # non-standard navigation "Dreamless", # non-standard navigation "EV", # non-standard navigation "Exposure", # non-standard navigation "Flipside", # non-standard navigation "HerobyNight", # non-standard navigation "JadeWarriors", # non-standard navigation "LastBlood", # non-standard navigation "MysticRevolution", # non-standard navigation
""" from __future__ import print_function import codecs import re import sys import os import requests sys.path.append(os.path.join(os.path.dirname(__file__), "..")) from dosagelib.util import tagre, getPageContent, asciify, unescape from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name json_file = __file__.replace(".py", ".json") #<a href="/shortname" class="alpha_list updated">name</a> url_matcher = re.compile( tagre("a", "href", r'(/[^"]+)', after="alpha_list") + r"([^<]+)</a>") # names of comics to exclude exclude_comics = [ "Angryprogrammer", # unavailable "Complex", # "coming soon" "Guinness", # "coming soon" "Jabberwoncky", # "coming soon" "KickyBrand", # unavailable "Penmanship", # unavailable "RandysRationale", # "coming soon" "SaturdayMorningBreakfastCereal", # duplicate "SignsOfOurTimes", # "coming soon" "TheGagwriter", # "coming soon" "Yaoyao", # "coming soon" ]
""" from __future__ import print_function import re import codecs import sys import os import requests sys.path.append(os.path.join(os.path.dirname(__file__), "..")) from dosagelib.util import getPageContent, asciify, unescape, tagre from dosagelib.scraper import get_scraperclasses from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name json_file = __file__.replace(".py", ".json") url_matcher = re.compile( tagre("a", "href", r'(/comics/[^/]+)\.html') + r'<strong>([^<]+)</strong>') # names of comics to exclude exclude_comics = [] def handle_url(url, session, res): """Parse one search result page.""" print("Parsing", url, file=sys.stderr) try: data, baseUrl = getPageContent(url, session) except IOError as msg: print("ERROR:", msg, file=sys.stderr) return for match in url_matcher.finditer(data): url = match.group(1)
""" from __future__ import print_function import codecs import re import sys import os import requests sys.path.append(os.path.join(os.path.dirname(__file__), "..")) from dosagelib.util import getPageContent, asciify, unescape, tagre, check_robotstxt from dosagelib.scraper import get_scraperclasses from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name json_file = __file__.replace(".py", ".json") # <div class="comictitle"><strong><a target="_blank" onclick="pageTrackerCG._link('http://collegepros.comicgenesis.com'); return false;" href="http://collegepros.comicgenesis.com">Adventures of the College Pros</a> url_matcher = re.compile(r'<div class="comictitle"><strong>' + tagre("a", "href", r'(http://[^"]+)') + r'([^<]+)</a>') num_matcher = re.compile(r'Number of Days: (\d+)') # names of comics to exclude exclude_comics = [ "10", # page is gone "54sinRed", # page is 403 forbidden "6D4", # redirected to another page "AaaSoCAwesomenessandaSliceofCheese", # broken images "AcrossthePond", # page moved "ACDeceptibotscomic", # no images "AdamandSei", # page has 403 forbidden "AdamsRoadGang", # page is gone "ADVENTURERS", # page is gone "AiYaiYai", # page moved "AlltheCommies", # missing images
def test_regex(self, tag, value, domatch):
    """Check a single (tag, value, domatch) case against the img/src matcher."""
    prefix_pattern = '(%s[^"]*)' % self.ValuePrefix
    self.match_tag(re.compile(tagre("img", "src", prefix_pattern)), tag, value, domatch)
Script to get a list of creators.com comics and save the info in a JSON file for further processing. """ from __future__ import print_function import re import codecs import sys import os import requests sys.path.append(os.path.join(os.path.dirname(__file__), "..")) from dosagelib.util import getPageContent, asciify, unescape, tagre from dosagelib.scraper import get_scraperclasses from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name json_file = __file__.replace(".py", ".json") url_matcher = re.compile(tagre("a", "href", r'/comics/([^/]+)\.html') + r'<strong>([^<]+)</strong>') # names of comics to exclude exclude_comics = [ ] def handle_url(url, session, res): """Parse one search result page.""" print("Parsing", url, file=sys.stderr) try: data = getPageContent(url, session) except IOError as msg: print("ERROR:", msg, file=sys.stderr) return for match in url_matcher.finditer(data): url = match.group(1)
Script to get a list of gocomics and save the info in a JSON file for further processing. """ from __future__ import print_function import codecs import re import sys import os import requests sys.path.append(os.path.join(os.path.dirname(__file__), "..")) from dosagelib.util import tagre, getPageContent, asciify, unescape from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name json_file = __file__.replace(".py", ".json") #<a href="/shortname" class="alpha_list updated">name</a> url_matcher = re.compile(tagre("a", "href", r'(/[^"]+)', after="alpha_list") + r"([^<]+)</a>") # names of comics to exclude exclude_comics = [ "Adagio", # too few comics "AgentGates", # too few comics "Apocalypseharry", # too few comics "BatkidandBatrat", # too few comics "BETWEENTHELINES", # comic unavailable "Bonner", # missing page "Buster", # comic unavailabe "CarteBlanche", # missing images "Critterdoodles", # missing images "CountyLine", # too few comics "Crawdiddy", # comic unavailable "DALTONDOG", # comic unavailable
import requests sys.path.append(os.path.join(os.path.dirname(__file__), "..")) # noqa from dosagelib.util import get_page, tagre, check_robotstxt from dosagelib.scraper import get_scrapers from scriptutil import (contains_case_insensitive, save_result, load_result, truncate_name, format_name) json_file = __file__.replace(".py", ".json") # <div class="comictitle"><strong><a target="_blank" # onclick="pageTrackerCG._link('http://collegepros.comicgenesis.com'); return # false;" href="http://collegepros.comicgenesis.com">Adventures of the College # Pros</a> url_matcher = re.compile(r'<div class="comictitle"><strong>' + tagre("a", "href", r'(http://[^"]+)') + r'([^<]+)</a>') num_matcher = re.compile(r'Number of Days: (\d+)') # names of comics to exclude exclude_comics = [ "10", # page is gone "54sinRed", # page is 403 forbidden "6D4", # redirected to another page "AaaSoCAwesomenessandaSliceofCheese", # broken images "AcrossthePond", # page moved "ACDeceptibotscomic", # no images "AdamandSei", # page has 403 forbidden "AdamsRoadGang", # page is gone "ADVENTURERS", # page is gone "AiYaiYai", # page moved
"TPTruePower", "TwoKeys", "UndertheSkin", "WelcometoFreakshow", "Whenweweresilent", "WhiteHeart", "Yaoishereforareason", "Zodiac", ] # links to last valid strips url_overrides = { } # HTML content matcher page_matcher = re.compile(tagre("a", "href", r'(comicprofile\.php\?id=\d+)', after="site_banner") + tagre("img", "title", r'([^"]+)')) url_matcher = re.compile(tagre("a", "href", r'(http://[^"]+/comics/)') + "Latest Comic") num_matcher = re.compile(r'50%">\s+(\d+)\s+') adult_matcher = re.compile(tagre("img", "src", r'http://www\.smackjeeves\.com/images/mature_content\.png')) def handle_url(url, session, res): """Parse one search result page.""" print("Parsing", url, file=sys.stderr) try: data = getPageContent(url, session) except IOError as msg: print("ERROR:", msg, file=sys.stderr) return for match in page_matcher.finditer(data): page_url = match.group(1)
""" from __future__ import print_function import re import sys import os import requests sys.path.append(os.path.join(os.path.dirname(__file__), "..")) from dosagelib.util import getPageContent, asciify, unescape, tagre, check_robotstxt from dosagelib.scraper import get_scraperclasses from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name json_file = __file__.replace(".py", ".json") # <div class="comictitle"><strong><a target="_blank" onclick="pageTrackerCG._link('http://collegepros.comicgenesis.com'); return false;" href="http://collegepros.comicgenesis.com">Adventures of the College Pros</a> url_matcher = re.compile(r'<div class="comictitle"><strong>' + tagre("a", "href", r'(http://[^"]+)') + r'([^<]+)</a>') num_matcher = re.compile(r'Number of Days: (\d+)') # names of comics to exclude exclude_comics = [ "10", # page is gone "54sinRed", # page is 403 forbidden "6D4", # redirected to another page "AaaSoCAwesomenessandaSliceofCheese", # broken images "AcrossthePond", # page moved "ACDeceptibotscomic", # no images "AdamandSei", # page has 403 forbidden "AdamsRoadGang", # page is gone "ADVENTURERS", # page is gone "AiYaiYai", # page moved
Script to get a list of gocomics and save the info in a JSON file for further processing. """ from __future__ import print_function import re import sys import os import json sys.path.append(os.path.join(os.path.dirname(__file__), "..")) from dosagelib.util import tagre, getPageContent, asciify, unescape from dosagelib.scraper import get_scrapers from scriptutil import contains_case_insensitive, capfirst json_file = __file__.replace(".py", ".json") #<a href="/shortname" class="alpha_list updated">name</a> url_matcher = re.compile(tagre("a", "href", r'(/[^"]+)', after="alpha_list") + r"([^<]+)</a>") # names of comics to exclude exclude_comics = [ "FrikkFrakkAndFrank", # too few comics "Apocalypseharry", # too few comics "BatkidandBatrat", # too few comics "BETWEENTHELINES", # comic unavailable "Bonner", # missing page "Buster", # comic unavailabe "DALTONDOG", # comic unavailable "DellAndSteve", # too few comics "Dilbert", # redirect "InkeeDoodles", # comic unavailable "MaggiesComics", # too few comics "OfMiceandMud", # too few comics
def test_regex(self, tag, value, domatch):
    """Match one tag test case with the img/src regex."""
    img_re = re.compile(
        tagre("img", "src", '(%s[^"]*)' % self.ValuePrefix))
    self.match_tag(img_re, tag, value, domatch)