import re
from datetime import date

def process_page(query_result):
    print("processing page %s" % query_result.href)
    html_text = utils.download_page(query_result.href, timeout=2)
    dates = extract_dates(html_text)
    # Keep only dates that are today or later.
    dates = [d for d in dates if d.dateRange.start >= date.today()]

    if not dates:
        return []

    print("found dates", dates)

    rex = re.compile(query_result.title, re.I | re.UNICODE | re.MULTILINE)

    # Pick the extracted date whose character offset lies closest to any
    # occurrence of the title in the page text.
    candidate_date = dates[0]
    min_dist = float("inf")
    i = 0
    while i < len(html_text):
        # search() scans forward from position i for the next occurrence.
        match = rex.search(html_text, i)
        if match:
            s = match.start()
            for d in dates:
                if abs(d.startPos - s) < min_dist:
                    min_dist = abs(d.startPos - s)
                    candidate_date = d

            i = match.end() + 1
        else:
            break

    print("candidate", candidate_date)
    return [Event(query_result.title, candidate_date.dateRange.start, candidate_date.dateRange.end)]
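All of these snippets delegate fetching to utils.download_page. A minimal sketch of what that helper plausibly looks like, inferred from its call sites here (a URL plus optional timeout and proxies, returning the body text or None on failure); the requests-based body is an assumption, not the project's actual code. The variant in the chapter-download example further down takes a folder and page instead, so that one is presumably a different helper with the same name.

import requests

def download_page(url, timeout=10, proxies=None):
    # Assumed contract: return the page body on success, None on any
    # error, so callers can simply test `if data:`.
    try:
        resp = requests.get(url, timeout=timeout, proxies=proxies)
        resp.raise_for_status()
        return resp.text
    except requests.RequestException:
        return None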
Example #2
def _get_total_posts(self):
    # Ask for a single post at offset 0 just to read the total count.
    url = self.base_url + "0&num=1"
    data = utils.download_page(url)
    if data:
        self.total_posts = int(self.total_post_re.findall(data)[0])
        if self.max_posts:
            self.total_posts = min(self.total_posts, self.max_posts)
        # Seed the work queue with one page offset per batch of `num` posts.
        limit_start = self.limit_start
        while limit_start < self.total_posts:
            self.post_queue.put(limit_start)
            limit_start += self.num
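The concatenation above implies that base_url ends in something like ...&limit_start=, so appending an offset and &num= yields one page of results. A small illustration of the offsets the loop enqueues, with hypothetical values:

base_url = "http://example.com/posts?limit_start="  # hypothetical endpoint
num = 20          # posts per page
total_posts = 65
limit_start = 0
offsets = []
while limit_start < total_posts:
    offsets.append(limit_start)
    limit_start += num
# offsets == [0, 20, 40, 60]; each becomes a URL such as
# http://example.com/posts?limit_start=40&num=20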
Example #3
    def _get_img_urls(self):
        # Consumer: drain the queue of page offsets filled by _get_total_posts.
        while not self.post_queue.empty():
            limit_start = self.post_queue.get()
            url = self.base_url + str(limit_start) + "&num=" + str(self.num)
            data = utils.download_page(url, proxies=self.proxies)
            if data:
                imgs = self.img_re.findall(data)
                for img in imgs:
                    # Strip escape backslashes (e.g. from JSON-escaped
                    # "http:\/\/..." URLs embedded in the page).
                    img = img.replace('\\', '')

                    if not self.need_save:
                        # Log-only mode: record the URL instead of downloading.
                        self.imglog.info("%s" % img)
                    else:
                        self.img_queue.put(img)
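Since _get_total_posts fills post_queue and _get_img_urls drains it, the two methods form a producer/consumer pair. A hypothetical driver, assuming a scraper object that owns both queues (the run_scraper name and worker count are made up here):

import threading

def run_scraper(scraper, workers=4):
    # Producer first: seed post_queue with one offset per page.
    scraper._get_total_posts()
    # Then drain the queue from several threads in parallel.
    threads = [threading.Thread(target=scraper._get_img_urls)
               for _ in range(workers)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()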
Example #4
import os

def get_single_chapter(name, chapter, url):
    # Save each page image of one chapter into <name>/ch<chapter>/.
    folder = os.path.join(name.replace(' ', '_'), "ch{}".format(chapter))
    utils.mkdir_p(folder)
    for page, img in _get_pages(url):
        utils.download_page(folder, page, img)
    print('making cbz for', folder)
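The final print announces a .cbz being built, and a .cbz is simply a ZIP archive of the page images. A sketch of that step (the make_cbz helper is hypothetical; the source only prints the message):

import os
import zipfile

def make_cbz(folder):
    # Pack every downloaded page in the chapter folder into <folder>.cbz.
    with zipfile.ZipFile(folder + ".cbz", "w", zipfile.ZIP_DEFLATED) as zf:
        for page in sorted(os.listdir(folder)):
            zf.write(os.path.join(folder, page), arcname=page)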
Example #5
    for (rex, format) in zip(rexs, formats):

        # Scan the whole string for every non-overlapping match of this pattern.
        start_pos = 0
        n = len(string)
        # print(format)
        while 0 <= start_pos < n:
            match = rex.search(string, start_pos)
            if match:
                # print("\t" + match.group())
                try:
                    dates.append(DateInfo(match.group(1), format, match.start(1), match.end(1)))
                except Exception:
                    # Matches that fail to parse into a DateInfo are skipped.
                    pass
                start_pos = match.end() + 1
            else:
                break
    # Drop obviously bogus parses (e.g. years beyond 3000).
    dates = [d for d in dates if d.dateRange.start <= date(3000, 1, 1)]
    return dates
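The snippets above assume a DateInfo object exposing startPos, endPos, and a dateRange with start/end dates. A minimal reconstruction of that shape; the strptime-based parsing is an assumption (the source does not show it), chosen because it raises on bad input, which would explain the try/except around construction:

from collections import namedtuple
from datetime import datetime

DateRange = namedtuple("DateRange", ["start", "end"])  # pair of datetime.date

class DateInfo(object):
    def __init__(self, text, format, start_pos, end_pos):
        self.startPos = start_pos   # character offset of the match in the page
        self.endPos = end_pos
        # Hypothetical parsing step; raises ValueError on unparseable input.
        parsed = datetime.strptime(text, format).date()
        self.dateRange = DateRange(parsed, parsed)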


# for testing
import pprint
from utils import download_page

if __name__ == "__main__":
    pp = pprint.PrettyPrinter(indent=4)
    page_text = download_page(
            "http://starforce.eu/")
    date_strs = extract_dates(page_text)
    for date_str in date_strs:
        print(date_str.startPos, date_str.endPos)