def extract_item_urls(self, response):
    """Return the absolute item URLs found in *response*.

    The body is first treated as JSON carrying an HTML fragment under
    ``data -> html``.  If reading that fragment raises for any reason,
    the raw response body is used as the markup instead.

    Every XPath expression in ``self.item_pattern`` is combined into a
    single union query (joined with ``|``).  The extracted values are
    de-duplicated through a ``set``, so the order of the returned list is
    not guaranteed.  Each value is made absolute with
    ``response.urljoin``.
    """
    try:
        markup = safely_json_loads(response.body)['data']["html"]
    except Exception:
        # Best-effort fallback: parse the body itself as the markup.
        markup = response.body

    selector = Selector(text=markup)
    union_query = "|".join(self.item_pattern)
    unique_links = set(selector.xpath(union_query).extract())
    return [response.urljoin(link) for link in unique_links]
def enrich_answers(self, item_loader, response):
    """Build answer items from the JSON body of *response*.

    For every entry under the ``"answers"`` key an ``AnswerItem`` loader
    is populated with upvotes, author, datetime and content.

    * An answer without comments is loaded immediately and added to
      *item_loader* under ``"answers"``.
    * An answer with comments is not added here.  Instead, one
      ``("replies", loader, {"url": ...})`` tuple is queued per page of
      20 comments, with the page offset passed as ``start``.

    Returns the list of queued tuples (empty if no answer has comments).
    NOTE(review): presumably the caller requests each queued URL and
    routes it to the "replies" enrichment step -- confirm against caller.
    """
    payload = safely_json_loads(response.body)
    pending = []

    for entry in payload["answers"]:
        loader = CustomLoader(item=AnswerItem())
        loader.add_value("upvotes", entry["useness"])
        loader.add_value("author", entry["user"]["name"])
        loader.add_value("datetime", entry["created_at"])
        loader.add_value("content", replace_entities(entry["content"]))

        comment_total = entry["num_of_comments"]
        if not comment_total:
            # Nothing more to fetch for this answer: finalize it now.
            item_loader.add_value("answers", loader.load_item())
            continue

        entry_id = entry["id"]
        # Strip the query string so the comments path is appended to the
        # bare URL of the current response.
        base_url = response.url.split("?")[0]
        pending.extend(
            (
                "replies",
                loader,
                {
                    "url": "%sanswers/%s/comments/?start=%s" % (
                        base_url, entry_id, offset)
                },
            )
            for offset in range(0, comment_total, 20)
        )

    return pending
def enrich_replies(self, item_loader, response):
    """Add the comments from *response* to *item_loader* as replies.

    The response body is decoded with ``safely_json_loads`` and the value
    under its ``"comments"`` key is added to the ``"replies"`` field of
    *item_loader*.  A debug message is logged first.  Returns ``None``.
    """
    self.logger.debug("Start to enrich_replies. ")
    item_loader.add_value(
        "replies",
        safely_json_loads(response.body)["comments"],
    )