def extract_item_urls(self, response):
    """Collect the absolute item URLs referenced by ``response``.

    The body is first treated as a JSON envelope whose ``data.html`` entry
    holds the markup to scan; if that lookup fails for any reason the raw
    body is scanned instead.  All XPath expressions in ``self.item_pattern``
    are combined into a single union expression, and every match is
    resolved against the response URL with ``response.urljoin``.

    Args:
        response: The response whose body contains (or wraps) the markup.

    Returns:
        list: Absolute URLs with duplicates removed, in the order the
        selector first produced them.
    """
    try:
        html = safely_json_loads(response.body)['data']["html"]
    except Exception:
        # Deliberate best-effort fallback: anything that is not the
        # expected JSON envelope is handed to the selector as-is.
        html = response.body
    sel = Selector(text=html)

    # De-duplicate with a seen-set instead of set(): iterating a set gives
    # an arbitrary order that can differ between runs, and comparing the
    # resolved URL also collapses distinct relative hrefs that point to the
    # same absolute address.
    seen = set()
    urls = []
    for href in sel.xpath("|".join(self.item_pattern)).extract():
        url = response.urljoin(href)
        if url not in seen:
            seen.add(url)
            urls.append(url)
    return urls
Beispiel #2
0
 def enrich_answers(self, item_loader, response):
     """Build one answer loader per entry in the decoded response body.

     Each element of ``data["answers"]`` is loaded into a fresh
     ``CustomLoader`` wrapping an ``AnswerItem``.  An answer whose
     ``num_of_comments`` is falsy is loaded straight away and added to
     ``item_loader`` under ``"answers"``.  An answer that has comments is
     not added here; instead one ``("replies", loader, {"url": ...})``
     tuple is queued for every ``start`` offset from 0 up to the comment
     count, in steps of 20.

     Returns:
         list: The queued ``(field, loader, kwargs)`` tuples.
         NOTE(review): presumably the caller schedules a request per tuple
         and feeds the result to the matching ``enrich_*`` method -- confirm.
     """
     payload = safely_json_loads(response.body)
     pending = []
     for entry in payload["answers"]:
         loader = CustomLoader(item=AnswerItem())
         loader.add_value("upvotes", entry["useness"])
         loader.add_value("author", entry["user"]["name"])
         loader.add_value("datetime", entry["created_at"])
         loader.add_value("content", replace_entities(entry["content"]))

         comment_total = entry["num_of_comments"]
         if not comment_total:
             # Nothing further to fetch: the answer is complete as it is.
             item_loader.add_value("answers", loader.load_item())
             continue

         answer_id = entry["id"]
         # Comment URLs hang off the current URL with its query string removed.
         base_url = response.url.split("?")[0]
         pending.extend(
             ("replies", loader, {
                 "url": "%sanswers/%s/comments/?start=%s" % (
                     base_url, answer_id, offset)
             })
             for offset in range(0, comment_total, 20)
         )
     return pending
Beispiel #3
0
 def enrich_replies(self, item_loader, response):
     """Add the comments carried by ``response`` to ``item_loader``.

     Decodes ``response.body`` with ``safely_json_loads`` and stores the
     value found under its ``"comments"`` key in the loader's ``"replies"``
     field.

     NOTE(review): presumably ``response`` is one of the comment pages
     requested via the ``"replies"`` entries that ``enrich_answers``
     queues, and ``item_loader`` is that answer's loader -- confirm
     against the caller.
     """
     self.logger.debug("Start to enrich_replies. ")
     data = safely_json_loads(response.body)
     # The whole "comments" value is handed over unchanged; no per-comment
     # processing happens here.
     item_loader.add_value("replies", data["comments"])