class RatingSpider(Spider):
    """Scrape reader ratings for every book stored in the books collection.

    For each book document the spider requests the book's
    ``/readers/read`` page on livelib.ru, yields one ``RatingItem`` per
    rated reader found on the page and follows the "next page" links.
    """

    name = "ratings"
    allowed_domains = ["livelib.ru"]

    def __init__(self, **kwargs):
        """Initialise the spider and create the ``Mongo`` helper it reads from."""
        super(RatingSpider, self).__init__(**kwargs)
        self.mongo = Mongo()

    def start_requests(self):
        """Yield one readers-page request per document in the books collection.

        The book's ``lib_id`` travels in ``request.meta["book_lib_id"]`` so
        that ``parse`` can attach it to every rating it extracts.
        """
        for book in self.mongo.books_collection().find():
            book_lib_id = book["lib_id"]
            readers_url = u"https://www.livelib.ru/book/{}/readers/read".format(book_lib_id)
            self.logger.info(u"Queuing page: {}".format(readers_url))
            yield Request(
                readers_url,
                callback=self.parse,
                meta={"book_lib_id": book_lib_id},
            )

    def parse(self, response):
        """Extract ratings from a readers page and queue the following pages.

        Yields:
            ``RatingItem`` objects with ``book_lib_id``, ``user_lib_id`` and
            ``rating`` set, followed by ``Request`` objects for every
            "next page" link, carrying the same ``book_lib_id`` in ``meta``.
        """
        self.logger.info(u"Parsing page: {0}".format(response.url))
        book_lib_id = response.meta["book_lib_id"]

        for rating_sel in response.xpath("//div[p/span[@class='rating']]"):
            star_class = rating_sel.xpath(
                ".//span[@class='rating']/span/@class"
            ).extract_first()
            if star_class is None:
                # No inner span/class attribute: slicing None would raise
                # TypeError and abort the whole page, pagination included.
                continue

            # The rating is the second character of the class attribute
            # (presumably something like "r4" - confirm against the markup).
            stars = star_class[1:2]
            if not stars or stars == "0":
                # "0" is skipped as before; an empty slice carries no rating
                # at all, so it is skipped instead of stored as "".
                continue

            rating = RatingItem()
            rating["book_lib_id"] = book_lib_id
            rating["user_lib_id"] = rating_sel.xpath(
                ".//a[@class='action']/@title"
            ).extract_first()
            rating["rating"] = stars
            yield rating

        next_hrefs = response.xpath(
            "//a[contains(@id, 'a-list-page-next')]/@href"
        ).extract()
        for href in next_hrefs:
            url = response.urljoin(href)
            self.logger.info(u"Queuing page: {0}".format(url))
            yield Request(
                url,
                callback=self.parse,
                meta={"book_lib_id": book_lib_id},
            )
class UserSpider(Spider):
    """Scrape profile data for users that have ratings but no stored profile.

    Every document in the ratings collection names a ``user_lib_id``; for
    each such id that is not yet present in the users collection the spider
    requests the reader's profile page and yields a ``UserItem``.
    """

    name = "users"
    allowed_domains = ["livelib.ru"]

    # Gender labels as they appear on the profile page -> stored value.
    _GENDER_VALUES = {
        u'женский': "female",
        u'мужской': "male",
    }

    def __init__(self, **kwargs):
        """Initialise the spider and create the ``Mongo`` helper it reads from."""
        super(UserSpider, self).__init__(**kwargs)
        self.mongo = Mongo()

    def start_requests(self):
        """Yield one profile request per rated user that is not stored yet.

        The user's id travels in ``request.meta["user_lib_id"]`` so that
        ``parse`` can copy it onto the resulting item.
        """
        # A user shows up once per rating; remember the ids already handled
        # so the database lookup and the request happen once per user.
        seen_user_ids = set()
        for rating in self.mongo.ratings_collection().find():
            user_lib_id = rating["user_lib_id"]
            if user_lib_id in seen_user_ids:
                continue
            seen_user_ids.add(user_lib_id)

            # find_one() stops at the first match and, unlike Cursor.count(),
            # is available in every PyMongo release.
            stored = self.mongo.users_collection().find_one(
                {"user_lib_id": user_lib_id}
            )
            if stored is not None:
                continue

            user_url = u"https://www.livelib.ru/reader/{0}".format(user_lib_id)
            self.logger.info(u"Queuing page: {}".format(user_url))
            yield Request(
                user_url,
                callback=self.parse,
                meta={"user_lib_id": user_lib_id},
            )

    def parse(self, response):
        """Build a ``UserItem`` from a reader's profile page.

        ``gender`` is set only when the page shows one of the two known
        labels; ``birth_date`` is set (stripped) only when the page has a
        non-empty value for it. ``user_lib_id`` is always taken from
        ``response.meta``.

        Yields:
            A single ``UserItem``.
        """
        self.logger.info(u"Parsing page: {0}".format(response.url))

        user = UserItem()
        user["user_lib_id"] = response.meta["user_lib_id"]

        profile_info = response.xpath("//div[@class='profile-info-column']")

        gender = profile_info.xpath(
            u".//span[@class='block-info'][contains(b/text(), 'Пол:')]/text()"
        ).extract_first()
        if gender:
            gender_value = self._GENDER_VALUES.get(gender.strip())
            if gender_value is not None:
                user["gender"] = gender_value

        birth_date = profile_info.xpath(
            u".//span[@class='block-info'][contains(b/text(), 'Дата рождения:')]/text()"
        ).extract_first()
        if birth_date:
            user["birth_date"] = birth_date.strip()

        yield user
def __init__(self, **kwargs):
    """Pass the keyword arguments to the parent initialiser, then attach a ``Mongo`` helper.

    The helper is stored on ``self.mongo``.
    """
    super(RatingSpider, self).__init__(**kwargs)
    self.mongo = Mongo()