Ejemplo n.º 1
0
    def check_higher_cn(self, response):
        """Probe upwards for the newest comment number, then start crawling down.

        Scrapy callback. The first user found on the fetched page is compared
        with ``self.first_user``:

        * If it differs, the probed position becomes ``self.starting_cn``,
          ``self.higher_cn`` is moved up by ``dub_step`` and that page is
          requested with this method as the callback again.
        * If it is unchanged, ``self.starting_cn`` is taken as the comment
          number of the first comment. A changed highest value is reported
          and stored through ``self.db.update_highest_cn``, and the downward
          crawl is started by requesting ``self.starting_cn`` with
          ``self.parse`` as the callback.

        Nothing is yielded when
        ``CommentsSpider.has_correct_content_type`` rejects the response.
        """
        if not CommentsSpider.has_correct_content_type(response):
            return

        page_first_user = sel.first_user(conv.body_html(response.body))

        if page_first_user != self.first_user:
            # The first user still changes: adopt this position and probe
            # one step higher.
            self.first_user = page_first_user
            self.starting_cn = self.higher_cn
            self.higher_cn = self.starting_cn + dub_step
            next_cn = self.higher_cn
            next_callback = self.check_higher_cn
        else:
            if self.prev_highest_cn != self.starting_cn:
                print("New highest observed comment number: {}".format(self.starting_cn))
                self.db.update_highest_cn(self.starting_cn, self.kind)

            # From here on, parse() steps the comment number downwards and
            # records comments until there are none left.
            self.current_cn = self.starting_cn
            next_cn = self.current_cn
            next_callback = self.parse

        yield scrapy.Request(
            url=self.page_url.format(cn=next_cn),
            cookies=self.cookies,
            callback=next_callback,
        )
Ejemplo n.º 2
0
    def parse(self, response):
        """Record the comments of one page and queue every follow-up request.

        Scrapy callback. The crawl continues only while the response passes
        ``CommentsSpider.has_correct_content_type`` and
        ``CommentsSpider.response_long_enough`` and
        ``self.brain.steps_without_new_content`` is below 100; otherwise a
        "Scrape finished" line is printed and nothing is yielded.

        When continuing, the method yields, in this order:

        1. a request for the next page, after lowering ``self.current_cn``
           by ``self.brain.step()``;
        2. for each comment that ``self.brain.is_duplicate`` did not reject:
           one reaction request per entry of ``self.reaction_types``,
           followed by the request built by ``self.meta_commenter_request``.

        Each comment is stored with ``self.db.save_comment`` right before its
        own requests are produced, i.e. as the generator is consumed.
        """
        print("-----xxxxx-----xxxxxx------" + str(self.brain.current_step))

        should_continue = (
            CommentsSpider.has_correct_content_type(response)
            and CommentsSpider.response_long_enough(response)
            and self.brain.steps_without_new_content < 100
        )
        if not should_continue:
            print("Scrape finished {}".format(self.supplier_name))
            return

        # Lazy extraction keeps the per-comment order of the calls:
        # comment_data() first, then is_duplicate() on its result.
        extracted = (
            sel.comment_data(node)
            for node in sel.all_comments(conv.body_html(response.body))
        )
        fresh_comments = [
            data for data in extracted if not self.brain.is_duplicate(data)
        ]

        # Queue the next page before dealing with this page's comments.
        self.current_cn -= self.brain.step()
        yield scrapy.Request(
            url=self.page_url.format(cn=self.current_cn),
            cookies=self.cookies,
            callback=self.parse,
        )

        for entry in fresh_comments:
            stored_id = self.db.save_comment(entry, self.supplier_id)

            for reaction_name, reaction_id in self.reaction_types.items():
                # The URL uses the comment's own "cid" field; the meta dict
                # carries the id returned by save_comment().
                reaction_url = self.reaction_url.format(
                    cid=entry["cid"], reaction_id=reaction_id
                )
                yield scrapy.Request(
                    url=reaction_url,
                    cookies=self.cookies,
                    callback=self.save_reactions,
                    meta={"cid": stored_id, "reaction": reaction_name},
                )

            print("saving meta commenter")
            yield self.meta_commenter_request(stored_id, entry["cid"])
Ejemplo n.º 3
0
 def find_starting_cn(self, response):
     """Seed the upward search for the newest comment number.

     Scrapy callback. Starts from ``self.prev_highest_cn``, records the
     first user of the fetched page in ``self.first_user`` and requests
     the page ``dub_step`` comment numbers higher, with
     ``self.check_higher_cn`` as the callback.

     Nothing is yielded when
     ``CommentsSpider.has_correct_content_type`` rejects the response.
     """
     if not CommentsSpider.has_correct_content_type(response):
         return

     self.starting_cn = self.prev_highest_cn
     self.higher_cn = self.starting_cn + dub_step
     self.first_user = sel.first_user(conv.body_html(response.body))

     probe_url = self.base_url.format(cn=self.higher_cn)
     yield scrapy.Request(
         url=probe_url,
         cookies=self.cookies,
         callback=self.check_higher_cn,
     )
Ejemplo n.º 4
0
    def parse(self, response):
        """Record the comments of one page and request the next page.

        Scrapy callback. The crawl continues only while the response passes
        ``CommentsSpider.has_correct_content_type`` and
        ``CommentsSpider.response_long_enough``; otherwise "Scrape finished"
        is printed and nothing is yielded.

        When continuing, ``self.current_cn`` is lowered by
        ``self.brain.step()`` and the request for that page is yielded.
        Comments that ``self.brain.is_duplicate`` did not reject are then
        stored with ``self.db.save_comment``; this happens after the yield,
        i.e. once the generator is resumed.
        """
        print("-----xxxxx-----xxxxxx------" + str(self.brain.current_step))

        should_continue = (
            CommentsSpider.has_correct_content_type(response)
            and CommentsSpider.response_long_enough(response)
        )
        if not should_continue:
            print("Scrape finished")
            return

        # Lazy extraction keeps the per-comment order of the calls:
        # comment_data() first, then is_duplicate() on its result.
        extracted = (
            sel.comment_data(node)
            for node in sel.all_comments(conv.body_html(response.body))
        )
        fresh_comments = [
            data for data in extracted if not self.brain.is_duplicate(data)
        ]

        self.current_cn -= self.brain.step()
        next_url = self.base_url.format(cn=self.current_cn)
        yield scrapy.Request(
            url=next_url,
            cookies=self.cookies,
            callback=self.parse,
        )

        for entry in fresh_comments:
            self.db.save_comment(entry, self.supplier_id)
Ejemplo n.º 5
0
 def save_reactions(self, response):
     """Store the reactions of one kind found in a reaction response.

     Scrapy callback. Reads ``"cid"`` and ``"reaction"`` from
     ``response.meta`` (``None`` when a key is absent), extracts the
     reactions of that kind from the response body with ``sel.reactions``
     and hands them to ``self.db.save_reactions`` together with the id.
     """
     request_meta = response.meta
     stored_comment_id = request_meta.get("cid")
     kind_name = request_meta.get("reaction")

     self.db.save_reactions(
         stored_comment_id,
         sel.reactions(conv.body_html(response.body), kind_name),
     )