def parse_thread(self, response):
    """Scrape one Page of a Thread and schedule the follow-up requests.

    Three things happen, in this order:

    1. A request is yielded for every commenter profile link found on
       the Page (handled by ``parse_user_profile``).
    2. Every comment on the Page is passed to ``collect_bitcoins``; each
       comment for which it returns something non-empty is yielded as an
       item.
    3. If a pagination link exists, a request for it is yielded (handled
       by this method again).

    Args:
        response: The downloaded Thread Page.

    Yields:
        ``scrapy.Request`` objects for profile pages and the next Page,
        and dicts with the keys ``comment_url``, ``bitcoin_addresses``
        and ``comment_text``.
    """
    # Navigate to every Commenter's Profile page.
    for href in response.css('.poster_info b a::attr(href)').extract():
        # urljoin leaves absolute URLs untouched and resolves relative
        # ones, which scrapy.Request would otherwise reject.
        profile_url = response.urljoin(href)
        logging.debug("Page {}, going to User {}".format(
            response.url, profile_url))
        yield scrapy.Request(profile_url, callback=self.parse_user_profile)

    # Search each comment for Bitcoin addresses. Comments are looked up
    # under both the `windowbg` and `windowbg2` CSS classes; the leading
    # dot matters, without it the selector targets a <windowbg2> element.
    comments = itertools.chain(
        response.css('.windowbg').extract(),
        response.css('.windowbg2').extract(),
    )
    for comment in comments:
        # collect_bitcoins is given bytes (UTF-8 encoded comment markup).
        addresses = collect_bitcoins(comment.encode())
        if not addresses:
            continue
        logging.debug("Page {}, yielding a Comment".format(response.url))
        yield {
            "comment_url": response.url,
            "bitcoin_addresses": addresses,
            "comment_text": comment,
        }

    # Navigate to the next Page in the Thread. The LAST matching link is
    # followed; presumably that is the "next" arrow.
    # NOTE(review): confirm what the last link is on a Thread's final Page.
    page_links = response.css('.prevnext .navPages ::attr(href)').extract()
    if page_links:
        next_url = response.urljoin(page_links[-1])
        logging.debug("Page {}, going to Page {}".format(
            response.url, next_url))
        yield scrapy.Request(next_url, callback=self.parse_thread)
def parse_user_profile(self, response):
    """Scrape Bitcoin addresses from a User's Profile page.

    The raw response body is handed to ``bitcoin_helper.collect_bitcoins``.
    When that returns an empty collection nothing is yielded.

    NOTE(review): another definition of ``parse_user_profile`` is visible
    in this source and emits the key ``profile_url`` instead of
    ``Profile URL``. If both live in the same class, the later one
    replaces this one; confirm which is intended.

    Args:
        response: The downloaded Profile page.

    Yields:
        At most one dict with the keys ``user_id``, ``Profile URL`` and
        ``bitcoin_addresses``.
    """
    found = bitcoin_helper.collect_bitcoins(response.body)
    if len(found) == 0:
        return

    # Text of the second cell in the first row under `.windowbg`;
    # presumably the user's name. TODO confirm against the page markup.
    name_cell = response.css(
        '.windowbg tr:nth-child(1) td:nth-child(2)::text')
    item = {
        "user_id": name_cell.extract_first(),
        "Profile URL": response.url,
        "bitcoin_addresses": found,
    }
    yield item
def parse_user_profile(self, response):
    """Scrape Bitcoin addresses from a User's Profile page.

    The raw response body is handed to ``collect_bitcoins``. When that
    returns an empty collection nothing is yielded.

    Args:
        response: The downloaded Profile page.

    Yields:
        At most one dict with the keys ``user_id``, ``profile_url`` and
        ``bitcoin_addresses``.
    """
    found = collect_bitcoins(response.body)
    if len(found) == 0:
        # Nothing on this profile; emit no item.
        return

    # Text of the second cell in the first row under `.windowbg`;
    # presumably the user's name. TODO confirm against the page markup.
    name_cell = response.css(
        '.windowbg tr:nth-child(1) td:nth-child(2)::text')
    user_id = name_cell.extract_first()

    logging.debug("User {}, yielding Bitcoins".format(response.url))
    item = {
        "user_id": user_id,
        "profile_url": response.url,
        "bitcoin_addresses": found,
    }
    yield item
def parse_page(self, response):
    """Scrape Bitcoin addresses from the comments on this Page.

    Every comment is passed to ``collect_bitcoins``. For each comment
    where it returns something non-empty, the comment markup is parsed
    with ``parse_comment`` and one item is yielded.

    Args:
        response: The downloaded Page.

    Yields:
        Dicts with the keys ``username``, ``bitcoin_addresses``,
        ``profile_url``, ``date``, ``comment`` and ``comment_url``.
    """
    # Comments are looked up under both the `windowbg` and `windowbg2`
    # CSS classes; the leading dot matters, without it the selector
    # targets a <windowbg2> element and matches nothing.
    comments = itertools.chain(
        response.css('.windowbg').extract(),
        response.css('.windowbg2').extract(),
    )
    for markup in comments:
        # collect_bitcoins is given bytes (UTF-8 encoded comment markup).
        addresses = collect_bitcoins(markup.encode())
        if not addresses:
            continue
        logging.debug("Page {}, yielding Bitcoins".format(response.url))
        # Only parse comments that actually contain an address.
        parsed = parse_comment(markup)
        yield {
            "username": parsed["username"],
            "bitcoin_addresses": addresses,
            "profile_url": parsed["profile_url"],
            "date": parsed["date"],
            "comment": parsed["comment"],
            "comment_url": response.url,
        }