Example 1
import xlwt
from threading import Lock  # assumption: the lock used below comes from threading

# `Downloader` is a project-specific helper class that is not shown in this snippet.


class Spider(object):

    start_urls = ['']

    def __init__(self):
        self.count = 0  # tracks the number of rows written to the xls file
        self.downloader = Downloader()
        self.next_url = None
        self.mutex = Lock()  # lock guarding concurrent file reads/writes

    def __enter__(self):
        # Create the xls workbook object.
        self.wb = xlwt.Workbook()
        # Add a worksheet.
        self.sheet = self.wb.add_sheet('Sheet 1')

        self.start_request()
        return self  # return the instance so `with Spider() as spider:` yields the object

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.wb.save('data.xls')

    def start_request(self):
        for url in self.start_urls:
            self.downloader.add_job(priority_number=1,
                                    job=url,
                                    handle=self.parse)
        self.downloader.create_threads()

    def parse(self, response):
        pass
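
The class is written as a context manager: the workbook is created in __enter__ and saved to data.xls in __exit__. A minimal usage sketch, assuming a concrete parse() implementation and a working Downloader (both are outside this snippet):

# Hypothetical usage; start_urls and parse() must be filled in first.
with Spider() as spider:
    # __enter__ has already called start_request(), so the downloader threads
    # fetch the start URLs and pass each response to spider.parse().
    pass
# __exit__ saves the collected rows to data.xls.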
Example 2
import os
import re

# Okapi, Downloader, Thumbnailer, the load_*/store_*/create_* helpers and the
# OC_*/SIZE_*/USER_AGENT constants are defined elsewhere in the project.


def main(cache_dir):
    okapi = Okapi(OC_OKAPI_KEY, user_agent=USER_AGENT)

    os.makedirs(cache_dir, exist_ok=True)
    os.makedirs(os.path.join(cache_dir, "json"), exist_ok=True)
    os.makedirs(os.path.join(cache_dir, "orig"), exist_ok=True)
    os.makedirs(os.path.join(cache_dir, "small"), exist_ok=True)
    os.makedirs(os.path.join(cache_dir, "big"), exist_ok=True)

    file_name = os.path.join(cache_dir, "json", "caches.json")
    if os.path.isfile(file_name):
        json_data = load_json(file_name)
    else:
        print("-- downloading query...")
        oc_codes = download_query(OC_USERNAME, OC_PASSWORD, OC_QUERYID)
        try:
            with open(MANUAL_CACHES_FILE, "r") as f:
                for oc_code in f:
                    oc_code = oc_code.strip()
                    if oc_code.startswith("OC"):
                        print("-- adding manual code {}".format(oc_code))
                        oc_codes.append(oc_code)
        except IOError:
            pass

        print("-> codes: {}".format(len(oc_codes)))
        fields = [
            'code', 'name', 'location', 'status', 'url', 'owner', 'founds',
            'date_hidden', 'date_created', 'short_description', 'description',
            'images', 'preview_image', 'internal_id'
        ]
        json_data = okapi.get_caches(oc_codes, fields)
        store_json(file_name, json_data)

    print("-- analyzing cache data...")
    caches = load_caches(json_data)
    caches = sorted(caches, key=lambda c: c._date, reverse=True)
    print("-> caches: {}".format(len(caches)))

    print("-- analyzing log data...")
    total_logs = 0
    logs_without_coords = 0
    for cache in caches:
        file_name = os.path.join(cache_dir, "json", f"{cache._code}-logs.json")
        if os.path.isfile(file_name):
            json_data = load_json(file_name)
        else:
            fields = [
                'uuid', 'date', 'user', 'type', 'comment', 'images',
                'internal_id'
            ]
            json_data = okapi.get_logs(cache._code, fields)
            store_json(file_name, json_data)
        cache._logs = load_logs(json_data)

        for log in cache._logs:
            total_logs += 1
            if log._coordinates is None:
                logs_without_coords += 1
    print("-- logs without coordinates: {}/{}".format(logs_without_coords,
                                                      total_logs))

    print("-- downloading missing images...")
    downloader = Downloader(threads=4, user_agent=USER_AGENT)
    thumbnailer = Thumbnailer(threads=4)
    for cache in caches:
        if cache._preview_image is not None:
            extension = 'noext'
            m = re.match(r'^.*\.([^.\?]+)(\?.*)?$', cache._preview_image)
            if m:
                extension = m.group(1)
            raw_image = '{}/{}/{}.{}'.format(cache_dir, "orig", cache._code,
                                             extension)
            downloader.add_job(cache._preview_image, raw_image)
            thumb_small = '{}/{}/{}.jpg'.format(cache_dir, "small",
                                                cache._code)
            thumbnailer.add_job(raw_image, thumb_small, SIZE_SMALL)
            thumb_big = '{}/{}/{}.jpg'.format(cache_dir, "big", cache._code)
            thumbnailer.add_job(raw_image, thumb_big, SIZE_BIG)
    downloader.run()

    print("-- scaling images...")
    thumbnailer.run()

    print("-- creating files...")
    create_db(caches, os.path.join(cache_dir, "safari.sqlite"))
    collect_logs(caches, os.path.join(cache_dir, "log-data.js"))
    createlist(caches, 30, cache_dir)
    create_feed(caches, os.path.join(cache_dir, "feed.xml"))
    create_sidebar(caches, "static/index.html",
                   os.path.join(cache_dir, "index.html"), cache_dir)
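
Downloader and Thumbnailer are not part of this snippet; the code above only relies on add_job(...) plus a blocking run(). A minimal sketch of such a threaded Downloader, built on the standard library (the constructor arguments mirror the usage above; everything else is an illustrative assumption):

import os
import queue
import threading
import urllib.request


class Downloader:
    """Sketch of the assumed add_job(url, path) / run() interface."""

    def __init__(self, threads=4, user_agent=None):
        self._threads = threads
        self._user_agent = user_agent
        self._jobs = queue.Queue()

    def add_job(self, url, target_path):
        # Skip files that already exist from a previous run.
        if not os.path.isfile(target_path):
            self._jobs.put((url, target_path))

    def _worker(self):
        while True:
            try:
                url, target_path = self._jobs.get_nowait()
            except queue.Empty:
                return
            request = urllib.request.Request(url)
            if self._user_agent:
                request.add_header("User-Agent", self._user_agent)
            try:
                with urllib.request.urlopen(request) as response, open(target_path, "wb") as f:
                    f.write(response.read())
            except OSError as e:
                print("-- download failed: {} ({})".format(url, e))

    def run(self):
        workers = [threading.Thread(target=self._worker) for _ in range(self._threads)]
        for w in workers:
            w.start()
        for w in workers:
            w.join()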
Example 3
import os
import re

# A variant of the function in Example 2 that uses module-level constants; mkdir,
# CACHE_DIR and the remaining helpers are again defined elsewhere in the project.


def main():
    okapi = Okapi(OC_OKAPI_KEY, user_agent=USER_AGENT)

    mkdir(CACHE_DIR)
    mkdir("{}/json".format(CACHE_DIR))
    mkdir("{}/orig".format(CACHE_DIR))
    mkdir("{}/small".format(CACHE_DIR))
    mkdir("{}/big".format(CACHE_DIR))

    file_name = "{}/json/caches.json".format(CACHE_DIR)
    if os.path.isfile(file_name):
        json_data = load_json(file_name)
    else:
        print("-- downloading query...")
        oc_codes = download_query(OC_USERNAME, OC_PASSWORD, OC_QUERYID)
        try:
            with open(MANUAL_CACHES_FILE, "r") as f:
                for oc_code in f:
                    oc_code = oc_code.strip()
                    if oc_code.startswith("OC"):
                        print("-- adding manual code {}".format(oc_code))
                        oc_codes.append(oc_code)
        except IOError:
            pass

        print("-> codes: {}".format(len(oc_codes)))
        fields = ['code', 'name', 'location', 'status', 'url', 'owner', 'founds', 'date_hidden', 'date_created',
                  'short_description', 'description', 'images', 'preview_image', 'internal_id']
        json_data = okapi.get_caches(oc_codes, fields)
        store_json(file_name, json_data)

    print("-- analyzing cache data...")
    caches = load_caches(json_data)
    caches = sorted(caches, key=lambda c: c._date, reverse=True)
    print("-> caches: {}".format(len(caches)))

    print("-- analyzing log data...")
    total_logs = 0
    logs_without_coords = 0
    for cache in caches:
        file_name = "{}/json/{}-logs.json".format(CACHE_DIR, cache._code)
        if os.path.isfile(file_name):
            json_data = load_json(file_name)
        else:
            fields = ['uuid', 'date', 'user', 'type', 'comment', 'images', 'internal_id']
            json_data = okapi.get_logs(cache._code, fields)
            store_json(file_name, json_data)
        cache._logs = load_logs(json_data)

        for log in cache._logs:
            total_logs += 1
            if log._coordinates is None:
                logs_without_coords += 1
    print("-- logs without coordinates: {}/{}".format(logs_without_coords, total_logs))

    print("-- downloading missing images...")
    downloader = Downloader(threads=4, user_agent=USER_AGENT)
    thumbnailer = Thumbnailer(threads=4)
    for cache in caches:
        if cache._preview_image is not None:
            extension = 'noext'
            m = re.match(r'^.*\.([^.\?]+)(\?.*)?$', cache._preview_image)
            if m:
                extension = m.group(1)
            raw_image = '{}/{}/{}.{}'.format(CACHE_DIR, "orig", cache._code, extension)
            downloader.add_job(cache._preview_image, raw_image)
            thumb_small = '{}/{}/{}.jpg'.format(CACHE_DIR, "small", cache._code)
            thumbnailer.add_job(raw_image, thumb_small, SIZE_SMALL)
            thumb_big = '{}/{}/{}.jpg'.format(CACHE_DIR, "big", cache._code)
            thumbnailer.add_job(raw_image, thumb_big, SIZE_BIG)
    downloader.run()

    print("-- scaling images...")
    thumbnailer.run()

    print("-- creating files...")
    create_db(caches, ".cache/safari.sqlite")
    collect_logs(caches, ".cache/log-data.js")
    createlist(caches, 30)
    create_feed(caches, ".cache/feed.xml")
    create_sidebar(caches, "static/index.html", ".cache/index.html")
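
The Thumbnailer used in both variants is also project-specific; the calls only assume add_job(source, target, size) and run(). A hedged sketch using Pillow, treating SIZE_SMALL/SIZE_BIG as maximum edge lengths in pixels (an assumption):

import os
from concurrent.futures import ThreadPoolExecutor

from PIL import Image  # Pillow


class Thumbnailer:
    """Sketch of the assumed add_job(source, target, size) / run() interface."""

    def __init__(self, threads=4):
        self._threads = threads
        self._jobs = []

    def add_job(self, source, target, size):
        # Only queue thumbnails that do not exist yet.
        if not os.path.isfile(target):
            self._jobs.append((source, target, size))

    def _scale(self, job):
        source, target, size = job
        try:
            with Image.open(source) as image:
                image = image.convert("RGB")   # JPEG output requires RGB
                image.thumbnail((size, size))  # scale down in place, keeping aspect ratio
                image.save(target, "JPEG")
        except OSError as e:
            print("-- scaling failed: {} ({})".format(source, e))

    def run(self):
        with ThreadPoolExecutor(max_workers=self._threads) as executor:
            list(executor.map(self._scale, self._jobs))
        self._jobs = []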
Example 4
import datetime
import json
import os
import re
import shutil
from typing import Any, Dict, List, Optional

import jinja2
from bs4 import BeautifulSoup

# Downloader, ImageScaler and podcastparser_parse are provided by the surrounding
# project; podcastparser_parse presumably wraps podcastparser.parse.


class Aggregator:
    def __init__(
        self,
        podcasts_json: str,
        cache_dir: str,
        export_dir: str,
        templates_dir: str,
        base_url: str,
    ) -> None:
        self._downloader = Downloader(
            4, "Lauf Podcast Aggregator, https://lauf-podcasts.flopp.net/")
        self._imagescaler = ImageScaler(4)
        self._podcasts_json_file = podcasts_json
        self._cache_dir = cache_dir
        self._export_dir = export_dir
        self._base_url = base_url
        self._podcasts: List[Dict[str, Any]] = []
        self._jinja = jinja2.Environment(
            loader=jinja2.FileSystemLoader(templates_dir), autoescape=True)
        self._jinja.filters["timestamp2date"] = self.format_date
        self._jinja.filters["timestamp2datetime"] = self.format_datetime
        self._jinja.filters["formatseconds"] = self.format_seconds
        self._jinja.globals["now"] = datetime.datetime.now().timestamp()
        self._jinja.globals["base_url"] = base_url

    def format_date(self, value: Optional[float]) -> str:
        if value is None:
            return "n/a"
        d = datetime.datetime.fromtimestamp(value)
        return d.strftime("%F")

    def format_datetime(self, value: Optional[float]) -> str:
        if value is None:
            return "n/a"
        d = datetime.datetime.fromtimestamp(value)
        return d.strftime("%F %T")

    def format_seconds(self, value: Optional[int]) -> str:
        if value is None:
            return "n/a"
        if value == 0:
            return "n/a"
        d = datetime.timedelta(seconds=value)
        return str(d)

    def clear_cache(self) -> None:
        if os.path.isdir(self._cache_dir):
            shutil.rmtree(self._cache_dir)

    def sync(self, keep_feeds: bool = False) -> None:
        with open(self._podcasts_json_file, "r") as f:
            self._podcasts = json.load(f)
        # filter podcasts without title or feed url
        self._podcasts = [
            p for p in self._podcasts if ("title" in p) and ("feed" in p)
        ]
        for podcast in self._podcasts:
            title = podcast["title"]
            feed_url = podcast["feed"]
            sanitized_title = self.sanitize(title)
            podcast["sanitized_title"] = sanitized_title
            dir = "{}/{}".format(self._cache_dir, sanitized_title)
            podcast["raw_dir"] = dir
            feed_file = "{}/feed".format(dir)
            podcast["feed_file"] = feed_file
            added = datetime.datetime.strptime(podcast["added"], "%Y-%m-%d")
            if added + datetime.timedelta(weeks=2) >= datetime.datetime.now():
                podcast["new"] = 1
            os.makedirs(dir, exist_ok=True)
            self._downloader.add_job(feed_url,
                                     feed_file,
                                     force=(not keep_feeds))
        self._downloader.run()
        # filter podcasts with non-existent feed file
        self._podcasts = [
            p for p in self._podcasts if os.path.exists(p["feed_file"])
        ]
        for podcast in self._podcasts:
            podcast["skip"] = False
            feed_url = podcast["feed"]
            feed_file = podcast["feed_file"]
            try:
                with open(feed_file, "r") as f:
                    podcast["data"] = podcastparser_parse(feed_url, f)
            except Exception:
                podcast["skip"] = True
                continue
            # determine latest publish date
            last_publish = None
            for episode in podcast["data"]["episodes"]:
                if not last_publish or episode["published"] > last_publish:
                    last_publish = episode["published"]
            podcast["last_publish"] = last_publish
            if last_publish is None:
                print("no last publish date: {}".format(podcast["title"]))
            # format descriptions
            podcast["data"]["description_formatted"] = self.format_description(
                podcast["data"]["description"])
            for episode in podcast["data"]["episodes"]:
                if "description_html" in episode:
                    episode["description_formatted"] = self.clean_html(
                        episode["description_html"])
                else:
                    episode["description_formatted"] = self.format_description(
                        episode["description"])
            # determine cover image
            cover_url = None
            if "cover_url" in podcast:
                cover_url = podcast["cover_url"]
            if not cover_url and ("cover_url" in podcast["data"]):
                cover_url = podcast["data"]["cover_url"]
            if cover_url:
                dir = "{}/{}".format(self._cache_dir,
                                     podcast["sanitized_title"])
                cover_file = "{}/cover".format(dir)
                self._downloader.add_job(cover_url, cover_file)
        self._downloader.run()
        # filter podcasts with skip attribute
        self._podcasts = [p for p in self._podcasts if not p["skip"]]
        # sort by reversed 'last_publish' timestamp
        self._podcasts.sort(key=lambda x: -x["last_publish"]
                            if x["last_publish"] is not None else 0)

    def clean_html(self, html: str) -> str:
        soup = BeautifulSoup(html, features="html.parser")
        for tag in soup():
            for attribute in ["class", "id", "name", "style"]:
                del tag[attribute]
        for tag_name in [
                "audio", "figure", "iframe", "img", "script", "video"
        ]:
            for tag in soup(tag_name):
                tag.decompose()
        return str(soup)

    def format_description(self, description: str) -> str:
        re_newline = re.compile(r"(\n)")
        re_divider = re.compile(r"((?:---+)|(?:\*\*\*+)|(?:\+\+\++))")
        re_link = re.compile(r"(https?://[A-Za-z0-9/.=\?&%_\-]+)")
        s = description
        s = re_newline.sub(r"<br />", s)
        s = re_divider.sub(r"<br />\1<br />", s)
        s = re_link.sub(r'<a href="\1" rel="nofollow" target="_blank">\1</a>',
                        s)
        return s

    def export(self) -> None:
        for podcast in self._podcasts:
            self._imagescaler.add_job(
                "{}/{}/cover".format(self._cache_dir,
                                     podcast["sanitized_title"]),
                "{}/{}/cover.jpg".format(self._export_dir,
                                         podcast["sanitized_title"]),
                512,
            )
        self._imagescaler.run()
        self.export_info()
        self.export_impressum()
        self.export_index()
        self.export_sitemap()
        for podcast in self._podcasts:
            self.export_podcast(podcast)
        self._imagescaler.run()

    def export_info(self) -> None:
        os.makedirs(self._export_dir, exist_ok=True)
        with open("{}/info.html".format(self._export_dir), "w") as f:
            template = self._jinja.get_template("info.html")
            f.write(template.render())

    def export_impressum(self) -> None:
        os.makedirs(self._export_dir, exist_ok=True)
        with open("{}/impressum.html".format(self._export_dir), "w") as f:
            template = self._jinja.get_template("impressum.html")
            f.write(template.render())

    def export_index(self) -> None:
        os.makedirs(self._export_dir, exist_ok=True)
        with open("{}/index.html".format(self._export_dir), "w") as f:
            template = self._jinja.get_template("index.html")
            f.write(template.render(podcasts=self._podcasts))

    def export_sitemap(self) -> None:
        os.makedirs(self._export_dir, exist_ok=True)
        with open("{}/sitemap.xml".format(self._export_dir), "w") as f:
            template = self._jinja.get_template("sitemap.xml")
            f.write(template.render(podcasts=self._podcasts))

    def export_podcast(self, podcast: Dict[str, Any]) -> None:
        dir = "{}/{}".format(self._export_dir, podcast["sanitized_title"])
        os.makedirs(dir, exist_ok=True)
        with open("{}/index.html".format(dir), "w") as f:
            template = self._jinja.get_template("podcast.html")
            f.write(
                template.render(podcast=podcast,
                                episodes=podcast["data"]["episodes"]))

    def sanitize(self, s: str) -> str:
        return re.sub(r"\W+", "-", s).lower()
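
A hedged driver sketch, based only on the constructor signature and the public methods above (the paths are placeholders, not taken from the project; the base URL is the one mentioned in the Downloader user agent):

# Hypothetical usage of the Aggregator class.
aggregator = Aggregator(
    podcasts_json="podcasts.json",
    cache_dir=".cache",
    export_dir="export",
    templates_dir="templates",
    base_url="https://lauf-podcasts.flopp.net/",
)
aggregator.sync(keep_feeds=True)  # keep_feeds=True reuses already-downloaded feed files
aggregator.export()               # renders the HTML pages, sitemap and scaled cover images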