def process_page(query_result):
    print "processing page %s" % query_result.href
    html_text = utils.download_page(query_result.href, timeout=2)
    dates = extract_dates(html_text)
    # Keep only dates that are not already in the past.
    dates = [d for d in dates if d.dateRange.start >= date.today()]
    if not dates:
        return []
    print "found dates", dates

    # Pick the date whose position in the page is closest to an
    # occurrence of the query title.
    rex = re.compile(query_result.title, re.I | re.UNICODE | re.MULTILINE)
    candidate_date = dates[0]
    min_dist = 10000000
    i = 0
    while i < len(html_text):
        match = rex.search(html_text, i)
        if match:
            s = match.start()
            for d in dates:
                if abs(d.startPos - s) < min_dist:
                    min_dist = abs(d.startPos - s)
                    candidate_date = d
            i = match.end() + 1
        else:
            break
    print "candidate", candidate_date
    return [Event(query_result.title,
                  candidate_date.dateRange.start,
                  candidate_date.dateRange.end)]
def _get_total_posts(self):
    url = self.base_url + "0&num=1"
    data = utils.download_page(url)
    if data:
        self.total_posts = int(self.total_post_re.findall(data)[0])
    if self.max_posts:
        self.total_posts = min(self.total_posts, self.max_posts)
    limit_start = self.limit_start
    while limit_start < self.total_posts:
        self.post_queue.put(limit_start)
        limit_start += self.num
def _get_img_urls(self):
    while not self.post_queue.empty():
        limit_start = self.post_queue.get()
        url = self.base_url + str(limit_start) + "&num=" + str(self.num)
        data = utils.download_page(url, proxies=self.proxies)
        if data:
            imgs = self.img_re.findall(data)
            for img in imgs:
                img = img.replace('\\', '')
                if not self.need_save:
                    self.imglog.info("%s" % img)
                else:
                    self.img_queue.put(img)
def get_single_chapter(name, chapter, url):
    folder = os.path.join(name.replace(' ', '_'), "ch{}".format(chapter))
    utils.mkdir_p(folder)
    for page, img in _get_pages(url):
        utils.download_page(folder, page, img)
    print 'making cbz for', folder
    # Body of the date-extraction routine (extract_dates). It assumes the
    # function defined `rexs` and `formats` (parallel lists of compiled
    # patterns and their date formats) and `dates` (the result accumulator)
    # earlier; those definitions are not part of this fragment.
    for (rex, format) in zip(rexs, formats):
        start_pos = 0
        n = len(string)
        # print format
        while 0 <= start_pos < n:
            match = rex.search(string, start_pos)
            if match:
                # print "\t" + match.group()
                try:
                    dates.append(DateInfo(match.group(1), format,
                                          match.start(1), match.end(1)))
                except Exception:
                    # Skip matches that can't be turned into a DateInfo.
                    pass
                start_pos = match.end() + 1
            else:
                break
    # Drop obviously bogus parses far in the future.
    dates = [d for d in dates if d.dateRange.start <= date(3000, 1, 1)]
    return dates


# for testing
from utils import download_page

if __name__ == "__main__":
    pp = pprint.PrettyPrinter(indent=4)
    page_text = download_page("http://starforce.eu/")
    date_strs = extract_dates(page_text)
    for date_str in date_strs:
        print date_str.startPos, date_str.endPos