Example #1
    def parse_articles(self, response):
        # init
        article = ArticleItem()
        article_snapshot = ArticleSnapshotItem()
        # get current time
        now = int(time.time())

        # populate article item
        article["site_id"] = self.site_id
        article["url"] = response.url
        article["article_type"] = self.assign_article_type()
        article["first_snapshot_at"] = now
        article["last_snapshot_at"] = now
        article["snapshot_count"] = 1
        article["next_snapshot_at"] = generate_next_snapshot_time(
            self.site_url, article["snapshot_count"], now)
        if "redirect_urls" in response.meta.keys():
            article["url"] = response.request.meta["redirect_urls"][0]
            article["redirect_to"] = response.url
        else:
            article["redirect_to"] = None
        article["url_hash"] = zlib.crc32(article["url"].encode())

        # populate article_snapshot item
        article_snapshot["raw_data"] = response.text
        article_snapshot["snapshot_at"] = now

        yield {"article": article, "article_snapshot": article_snapshot}
Example #2
    def crawl_and_insert(self, article_urls):
        for url in article_urls:
            url_hash = str(zlib.crc32(url.encode()))
            if self.db_has(url, url_hash):
                continue

            now = int(time.time())
            snapshot = requests.get(url, cookies={"over18": "1"}).text
            next_snapshot_at = generate_next_snapshot_time(
                self.site_url, snapshot_count=1, snapshot_time=now
            )

            inserted_article_id = self.queries.insert_article(
                site_id=self.site_id,
                url=url,
                url_hash=url_hash,
                first_snapshot_at=now,
                last_snapshot_at=now,
                next_snapshot_at=next_snapshot_at,
                snapshot_count=1,
                redirect_to=None,
                article_type="PTT",
            )

            self.queries.insert_snapshot(
                article_id=inserted_article_id, snapshot_at=now, raw_data=snapshot
            )
            logger.info(
                f"Finish discover {url}, new article_id = {inserted_article_id}"
            )
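
Both discovery paths key articles by a CRC32 of the URL, but parse_articles stores the raw integer while crawl_and_insert stores str() of it. zlib.crc32 in Python 3 always returns an unsigned 32-bit integer, so the two forms carry the same value and differ only in type; a quick illustration (placeholder URL, not from the project):

import zlib

url = "https://www.ptt.cc/bbs/Gossiping/M.0000000000.A.AAA.html"  # placeholder
url_hash = zlib.crc32(url.encode())      # unsigned int in range 0 .. 2**32 - 1
print(url_hash, str(url_hash))           # same value, stored as int or as str above
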
Example #3
    def update_post(self, response, post_response, article_id, snapshot_count):
        comment_response = json.loads(response.body.decode("utf-8"))
        # init
        article = ArticleItem()
        article_snapshot = ArticleSnapshotItem()
        now = int(time.time())

        # populate article item
        # copy from the original article
        article["article_id"] = article_id
        # update
        article["last_snapshot_at"] = now

        if response.status in self.handle_httpstatus_list:
            article["snapshot_count"] = snapshot_count
            article["next_snapshot_at"] = 0
            article_snapshot = None

        else:
            article["snapshot_count"] = snapshot_count + 1
            article["next_snapshot_at"] = generate_next_snapshot_time(
                "dcard", article["snapshot_count"], now
            )

            # populate article_snapshot item
            post_comments = {"post": post_response, "comments": comment_response}
            article_snapshot["raw_data"] = json.dumps(post_comments)
            article_snapshot["snapshot_at"] = now
            article_snapshot["article_id"] = article_id

        yield {"article": article, "article_snapshot": article_snapshot}
Example #4
    def get_comments(self, response, post_id, post_info):
        comments_api_result = json.loads(response.body.decode("utf-8"))
        # prepare Items
        article = ArticleItem()
        article_snapshot = ArticleSnapshotItem()
        # get current time
        now = int(time.time())

        # populate article item
        article["site_id"] = self.site_id
        article["url"] = f"https://www.dcard.tw/f/{self.forum_name}/p/{post_id}"
        article["url_hash"] = zlib.crc32(article["url"].encode())
        article["article_type"] = "Dcard"
        article["first_snapshot_at"] = now
        article["last_snapshot_at"] = now
        article["snapshot_count"] = 1
        article["next_snapshot_at"] = generate_next_snapshot_time(
            "dcard", article["snapshot_count"], now)
        article["redirect_to"] = None

        # populate article_snapshot item
        post_comments = {"post": post_info, "comments": comments_api_result}
        article_snapshot["raw_data"] = json.dumps(post_comments)
        article_snapshot["snapshot_at"] = now

        yield {"article": article, "article_snapshot": article_snapshot}
Example #5
def discover(args):
    # Check whether args.url already exists in the DB; if so, log a message and exit.
    url = url_normalize(args.url)
    url_hash = str(zlib.crc32(url.encode()))
    result = queries.get_article_id_by_url(url=url, url_hash=url_hash)
    if result is not None:
        logger.info(
            f"URL exists in the database, with article_id {result['article_id']}. Please do update instead"
        )
        return

    crawler_config = SiteConfig.default()
    user_agent = args.ua or crawler_config["ua"]
    now = int(time.time())
    if "dcard" in args.url:
        snapshot = get_dcard_article(args.url, user_agent)
    elif "ptt" in args.url:
        snapshot = get_article_by_request(args.url, user_agent, cookies={"over18": "1"})
    elif args.selenium:
        snapshot = get_article_by_selenium(args.url, user_agent)
    else:
        snapshot = get_article_by_request(args.url, user_agent)

    article_type = helpers.get_article_type(args.url)
    url_hash = zlib.crc32(args.url.encode())
    if args.site_id:
        site_info = queries.get_site_by_id(site_id=args.site_id)
    else:
        site_info = None

    next_snapshot_at = (
        0
        if site_info is None
        else helpers.generate_next_snapshot_time(
            site_info["url"], snapshot_count=1, snapshot_time=now
        )
    )

    inserted_article_id = queries.insert_article(
        site_id=args.site_id,
        url=args.url,
        url_hash=url_hash,
        first_snapshot_at=now,
        last_snapshot_at=now,
        next_snapshot_at=next_snapshot_at,
        snapshot_count=1,
        redirect_to=None,
        article_type=article_type,
    )

    queries.insert_snapshot(
        article_id=inserted_article_id, snapshot_at=now, raw_data=snapshot
    )
    logger.info(f"Finish discover {args.url}, new article_id = {inserted_article_id}")
    return inserted_article_id
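
discover deduplicates against the database with url_normalize (the PyPI package of that name), so superficially different spellings of the same address resolve to the same lookup key. A small illustration, assuming that package is installed (placeholder URL):

import zlib

from url_normalize import url_normalize

raw = "WWW.Example.com/posts/123"            # placeholder, not from the project
url = url_normalize(raw)                     # lowercases the host and adds a default scheme
url_hash = str(zlib.crc32(url.encode()))     # same key shape used by get_article_id_by_url
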
Example #6
def update_article_table(article_info, site_info, crawl_time):
    next_snapshot_at = article_info["next_snapshot_at"]
    article_id = article_info["article_id"]
    snapshot_count = article_info["snapshot_count"]

    if next_snapshot_at != 0:
        next_snapshot_at = helpers.generate_next_snapshot_time(
            site_info["url"], snapshot_count=1, snapshot_time=crawl_time
        )
    queries.update_article_snapshot_time(
        article_id=article_id,
        last_snapshot_at=crawl_time,
        snapshot_count=snapshot_count + 1,
        next_snapshot_at=next_snapshot_at,
    )
Example #7
    def update_article(self, response, article_id, snapshot_count):
        article = ArticleItem()
        article_snapshot = ArticleSnapshotItem()
        now = int(time.time())

        article["article_id"] = article_id
        article["last_snapshot_at"] = now

        if response.status in self.handle_httpstatus_list:
            article["snapshot_count"] = snapshot_count
            article["next_snapshot_at"] = 0
            article_snapshot = None
        else:
            article["snapshot_count"] = snapshot_count + 1
            article["next_snapshot_at"] = generate_next_snapshot_time(
                self.site_url, article["snapshot_count"], now)

            article_snapshot["raw_data"] = response.text
            article_snapshot["snapshot_at"] = now
            article_snapshot["article_id"] = article_id

        yield {"article": article, "article_snapshot": article_snapshot}