def get_results():
    """Parse all search result pages."""
    # Collected results, mapping comic name -> shortname.
    found = {}
    session = requests.Session()
    handle_url('http://www.arcamax.com/comics', session, found)
    save_result(found, json_file)
def main(args):
    """Get scraper descriptions from google results."""
    # Resume from a previous run if the JSON cache file exists.
    if os.path.isfile(json_file):
        result = load_result(json_file)
    else:
        result = {}
    # Optional first CLI argument: name of the scraper class to start from.
    if args:
        tofind = args[0]
    else:
        tofind = None
    for scraperclass in sorted(get_scraperclasses(), key=classname):
        key = classname(scraperclass)
        # Skip entries until the requested start class is reached.
        if tofind and key != tofind:
            continue
        # Clear the marker so this and all following classes are processed.
        tofind = None
        # Skip sub-scrapers (their class names contain an underscore).
        if '_' in key:
            continue
        print(key)
        # Skip scrapers that already carry a description.
        if scraperclass.description:
            continue
        # Skip scrapers already present in the cached results.
        if key in result:
            continue
        url = get_scraper_url(scraperclass)
        print(url)
        lang = scraperclass.lang
        description = get_description(url, lang)
        if description:
            print(description)
            # store result
            module = scraperclass.__module__
            result[key] = dict(description=description, module=module, url=url)
            # Save after every hit so progress survives an interruption.
            save_result(result, json_file)
        else:
            print("No description found")
    return 0
def get_results():
    """Parse all search result pages."""
    comics = {}  # maps comic name -> shortname
    session = requests.Session()
    handle_url("http://keenspot.com/", session, comics)
    save_result(comics, json_file)
def get_results():
    """Parse all search result pages."""
    # Mapping of comic name -> shortname, filled in by handle_url.
    results = {}
    http = requests.Session()
    start_url = 'http://keenspot.com/'
    handle_url(start_url, http, results)
    save_result(results, json_file)
def get_results():
    """Parse all search result pages."""
    # Mapping {name -> shortname} collected across all index pages.
    collected = {}
    session = requests.Session()
    url_template = 'http://guide.comicgenesis.com/Keenspace_%s.html'
    # One index page per letter; '0' covers titles starting with a digit.
    for letter in '0ABCDEFGHIJKLMNOPQRSTUVWXYZ':
        handle_url(url_template % letter, session, collected)
    save_result(collected, json_file)
def get_results():
    """Parse all search result pages."""
    # {name -> shortname} accumulated over every listing page.
    comics = {}
    session = requests.Session()
    listing_urls = (
        'http://www.gocomics.com/features',
        'http://www.gocomics.com/explore/editorial_list',
        'http://www.gocomics.com/explore/sherpa_list',
    )
    for url in listing_urls:
        handle_url(url, session, comics)
    save_result(comics, json_file)
def get_results():
    """Parse all search result pages."""
    res = {}  # comic name -> shortname
    session = requests.Session()
    # The guide is split alphabetically; "0" groups numeric titles.
    pages = ["http://guide.comicgenesis.com/Keenspace_%s.html" % c
             for c in "0ABCDEFGHIJKLMNOPQRSTUVWXYZ"]
    for page in pages:
        handle_url(page, session, res)
    save_result(res, json_file)
def get_results(pages=382):
    """Parse all search result pages.

    @param pages: number of search result pages to fetch; the default of
        382 is what an empty-string search returned when this was written.
    """
    # store info in a dictionary {name -> shortname}
    res = {}
    session = requests.Session()
    baseUrl = ('http://comicfury.com/search.php?search=1&webcomics=Search+for+webcomics'
               '&query=&worder=5&asc=1&incvi=1&incse=1&incnu=1&incla=1'
               '&all_ge=1&all_st=1&all_la=1&page=')
    for i in range(1, pages + 1):
        # result pages are numbered starting at 1
        handle_url(baseUrl + str(i), session, res)
    save_result(res, json_file)
def get_results():
    """Parse all search result pages."""
    # Collected comics: {name -> shortname}.
    found = {}
    session = requests.Session()
    search_url = ('http://comicfury.com/search.php?search=1&webcomics=Search+for+webcomics'
                  '&query=&worder=5&asc=1&incvi=1&incse=1&incnu=1&incla=1'
                  '&all_ge=1&all_st=1&all_la=1&page=')
    # A search for an empty string returned 382 result pages.
    for page_num in range(1, 383):
        handle_url('%s%d' % (search_url, page_num), session, found)
    save_result(found, json_file)
def get_results(result_pages=286):
    """Parse all search result pages.

    @param result_pages: number of result pages to scan; a search for an
        empty string returned 286 result pages when this was written.
    """
    base = ("http://www.smackjeeves.com/search.php?submit=Search+for+Webcomics"
            "&search_mode=webcomics&comic_title=&special=all&last_update=3"
            "&style_all=on&genre_all=on&format_all=on&sort_by=2&start=%d")
    session = requests.Session()
    # store info in a dictionary {name -> url, number of comics, adult flag, bounce flag}
    res = {}
    print("Parsing", result_pages, "search result pages...", file=sys.stderr)
    for i in range(0, result_pages):
        # progress indicator on stderr (1-based for the human reader)
        print(i + 1, file=sys.stderr, end=" ")
        # the start= offset advances in steps of 12 results per page
        handle_url(base % (i * 12), session, res)
    save_result(res, json_file)
def get_results():
    """Parse all search result pages."""
    search_url = "http://www.drunkduck.com/search/?page=%d&search=&type=0&type=1&last_update="
    # Anchor links to comic pages and the "<n> pages" count next to them.
    link_re = re.compile(tagre("a", "href", r'(/[^"]+/)', before="size24 yanone blue"))
    count_re = re.compile(r'(\d+) pages?</span>')
    comics = {}  # comic name -> number of comic pages
    # A search for an empty string returned 825 result pages.
    num_pages = 825
    print("Parsing", num_pages, "search result pages...", file=sys.stderr)
    session = requests.Session()
    for page in range(1, num_pages + 1):
        print(page, file=sys.stderr, end=" ")
        handle_url(search_url % page, session, link_re, count_re, comics)
    save_result(comics, json_file)
def get_results():
    """Parse all search result pages."""
    url_fmt = "http://www.theduckwebcomics.com/search/?page=%d&search=&type=0&type=1&last_update="
    anchor_pattern = re.compile(tagre("a", "href", r'(/[^"]+/)', before="size24 yanone blue"))
    pagecount_pattern = re.compile(r'(\d+) pages?</span>')
    # Result dictionary: {name -> number of comics}.
    results = {}
    # A search for an empty string returned 825 result pages.
    total = 825
    print("Parsing", total, "search result pages...", file=sys.stderr)
    session = requests.Session()
    page = 1
    while page <= total:
        print(page, file=sys.stderr, end=" ")
        handle_url(url_fmt % page, session, anchor_pattern, pagecount_pattern, results)
        page += 1
    save_result(results, json_file)
def get_results():
    """Parse all search result pages."""
    # {name -> shortname} for every comic found.
    found = {}
    session = requests.Session()
    # Sort by page count, so we can abort when we get under some threshold.
    search_fmt = ('http://comicfury.com/search.php?search=1&webcomics=1&query=' +
                  '&worder=1&asc=0&incvi=1&incse=1&incnu=1&incla=1&all_ge=1' +
                  '&all_st=1&all_la=1&page=%d')
    print("Parsing search result pages...", file=sys.stderr)
    page = 1
    count = 999  # sentinel so the loop body runs at least once
    while count >= MIN_COMICS:
        count = handle_url(search_fmt % page, session, found)
        page += 1
        print(count, file=sys.stderr, end=" ")
    save_result(found, json_file)
def main(args):
    """Get scraper descriptions from google results."""
    # Load previously collected descriptions if the cache file exists.
    if os.path.isfile(json_file):
        result = load_result(json_file)
    else:
        result = {}
    for classname, info in sorted(result.items()):
        # Skip scrapers that are already documented, and sub-scrapers
        # (names containing an underscore).
        if has_description(classname) or '_' in classname:
            continue
        # Skip entries that were rejected in a previous run.
        if info.get('answer') == 'no':
            continue
        if not answer(classname, info):
            # Remember the rejection so it is not asked about again.
            info['answer'] = 'no'
            save_result(result, json_file)
            continue
        # Derive the source file path from the dotted module name.
        filename = info['module'].replace('.', os.sep) + ".py"
        encoding = get_encoding(filename)
        # Write the updated module to a temporary "<name>_" file first,
        # then rename it over the original.
        with codecs.open(filename, 'r', encoding) as f:
            with codecs.open(filename + "_", 'w', encoding) as out:
                write_description(f, out, classname, info)
        os.rename(filename + "_", filename)
    return 0
def main(args):
    """Get scraper descriptions from google results."""
    # Resume from the cached results if present.
    result = load_result(json_file) if os.path.isfile(json_file) else {}
    for name, info in sorted(result.items()):
        # Already documented, or a sub-scraper (underscore in name): skip.
        if has_description(name) or '_' in name:
            continue
        # The user said "no" to this one in an earlier session.
        if info.get('answer') == 'no':
            continue
        if not answer(name, info):
            # Record the rejection so it is not asked again.
            info['answer'] = 'no'
            save_result(result, json_file)
            continue
        # Module path -> source file path.
        filename = info['module'].replace('.', os.sep) + ".py"
        encoding = get_encoding(filename)
        tmpname = filename + "_"
        # Rewrite into a temporary file, then rename over the original.
        with codecs.open(filename, 'r', encoding) as src, \
             codecs.open(tmpname, 'w', encoding) as dst:
            write_description(src, dst, name, info)
        os.rename(tmpname, filename)
    return 0