def extract_product(self):
    """Fetch the product's Ceneo page, read its name, and collect all opinions.

    Populates ``self.name`` and appends one extracted-and-transformed
    ``Opinion`` per review to ``self.opinions``, following the
    "pagination__next" link until the last review page.
    """
    url_prefix = "https://www.ceneo.pl"
    url_postfix = "#tab=reviews"
    url = url_prefix + "/" + self.product_id + url_postfix
    page_respons = requests.get(url)
    page_tree = BeautifulSoup(page_respons.text, 'html.parser')
    self.name = extract_element(page_tree, "h1", "product-name")
    try:
        opinions_count = int(
            extract_element(page_tree, "a", "product-reviews-link", "span"))
    except AttributeError:
        # No reviews link on the page -> product has no opinions.
        opinions_count = 0
    if opinions_count > 0:
        while url:
            # Reuse the tree parsed above for the first iteration; the
            # original code re-downloaded the same URL a second time.
            opinions = page_tree.find_all("div", "js_product-review")
            for opinion in opinions:
                op = Opinion()
                op.extract_opinion(opinion)
                op.transform_opinion()
                self.opinions.append(op)
            try:
                url = url_prefix + page_tree.find(
                    "a", "pagination__next")["href"]
            except TypeError:
                # find() returned None -> no next page; stop paginating.
                url = None
            if url:
                page_respons = requests.get(url)
                page_tree = BeautifulSoup(page_respons.text, 'html.parser')
def extract_product(self):
    """Fetch the product's Ceneo page, read its name, and collect all opinions.

    Populates ``self.name`` and appends one ``Opinion`` per review to
    ``self.opinions``, paging through review pages until no "next" link
    remains.
    """
    url_prefix = "https://www.ceneo.pl"
    url_postfix = "#tab=reviews"
    url = url_prefix + "/" + self.product_id + url_postfix
    page_respons = requests.get(url)
    page_tree = BeautifulSoup(page_respons.text, 'html.parser')
    self.name = extract_element(page_tree, "h1", "product-name")
    try:
        opinions_count = int(
            extract_element(page_tree, "a", "product-reviews-link", "span"))
    except AttributeError:
        # No reviews link on the page -> product has no opinions.
        opinions_count = 0
    if opinions_count > 0:
        while url:
            # Download the HTML of the page at the given URL.
            page_respons = requests.get(url)
            page_tree = BeautifulSoup(page_respons.text, 'html.parser')
            # Extract the fragments corresponding to individual reviews.
            opinions = page_tree.find_all("div", "js_product-review")
            # Extract the components of each single opinion.
            for opinion in opinions:
                op = Opinion()
                op.extract_opinion(opinion)
                # BUG FIX: the method was referenced without parentheses
                # and therefore never executed.
                op.transform_opinion()
                self.opinions.append(op)
            try:
                url = url_prefix + page_tree.find(
                    "a", "pagination__next")["href"]
            except TypeError:
                # find() returned None -> no next page; stop paginating.
                url = None
def extract_product(self):
    """Fetch the product's Ceneo page, read its name, and collect all opinions.

    Populates ``self.name`` and appends one ``Opinion`` per review to
    ``self.opinions``, paging through review pages until no "next" link
    remains.
    """
    url_prefix = 'https://www.ceneo.pl'
    url_postfix = '#tab=reviews'
    url = url_prefix + '/' + self.product_id + url_postfix
    page_respons = requests.get(url)
    page_tree = BeautifulSoup(page_respons.text, 'html.parser')
    self.name = extract_element(page_tree, 'h1', 'product-name')
    try:
        opinions_count = int(
            extract_element(page_tree, 'a', 'product-reviews-link', 'span'))
    except AttributeError:
        # No reviews link on the page -> product has no opinions.
        opinions_count = 0
    if opinions_count > 0:
        while url:
            # Download the HTML of the page at the given URL.
            page_respons = requests.get(url)
            page_tree = BeautifulSoup(page_respons.text, 'html.parser')
            # Extract the fragments corresponding to individual reviews.
            opinions = page_tree.find_all('div', 'js_product-review')
            # Extract the components of each single opinion.
            for opinion in opinions:
                op = Opinion()
                op.extract_opinion(opinion)
                # BUG FIX: the method was referenced without parentheses
                # and therefore never executed.
                op.transform_opinion()
                self.opinions.append(op)
            try:
                url = url_prefix + page_tree.find(
                    'a', 'pagination__next')['href']
            except TypeError:
                # find() returned None -> no next page; stop paginating.
                url = None
def extract_opinion(self, opinion):
    """Populate this object's fields from one parsed review tag.

    Reads the simple fields via the ``self.tags`` selector table, then
    fills ``opinion_id``, ``pros``/``cons`` feature lists, and the
    review/purchase dates.
    """
    # Table-driven extraction of the simple attributes.
    for attr_name, selector_args in self.tags.items():
        setattr(self, attr_name, extract_element(opinion, *selector_args))
    self.opinion_id = int(opinion["data-entry-id"])

    def joined_features(title_class):
        # Join the feature items following the given section title;
        # None when the section is absent (find() returns None).
        try:
            header = opinion.find("div", title_class)
            items = header.find_next_siblings("div", "review-feature__item")
            return ", ".join(item.get_text().strip() for item in items)
        except AttributeError:
            return None

    self.pros = joined_features("review-feature__title--positives")
    self.cons = joined_features("review-feature__title--negatives")

    time_tags = opinion.find("span", "user-post__published").find_all("time")
    self.review_date = time_tags.pop(0)["datetime"]
    # A second <time> tag (the purchase date) is optional.
    if time_tags:
        self.purchase_date = time_tags.pop(0)["datetime"]
    else:
        self.purchase_date = None
def extract_opinion(self, opinion):
    """Fill this object's attributes from one review <div> tag.

    Simple fields come from the ``self.tags`` selector table; pros/cons
    are joined feature lists (or None when the section is missing);
    dates come from the <time> tags in the publication span.
    """
    for field, selector in self.tags.items():
        setattr(self, field, extract_element(opinion, *selector))
    self.opinion_id = int(opinion['data-entry-id'])
    # Pros and cons share the same extraction shape: drive both from data.
    feature_sections = (
        ('pros', 'review-feature__title--positives'),
        ('cons', 'review-feature__title--negatives'),
    )
    for field, title_class in feature_sections:
        try:
            entries = opinion.find('div', title_class).find_next_siblings(
                'div', 'review-feature__item')
            setattr(self, field,
                    ', '.join(entry.get_text().strip() for entry in entries))
        except AttributeError:
            # Section title not found -> find() returned None.
            setattr(self, field, None)
    time_tags = opinion.find('span', 'user-post__published').find_all('time')
    self.review_date = time_tags.pop(0)['datetime']
    try:
        # The purchase date is an optional second <time> tag.
        self.purchase_date = time_tags.pop(0)['datetime']
    except IndexError:
        self.purchase_date = None
def extract_opinion(self, opinion):
    """Populate fields from one review tag via the selector table.

    Returns self so calls can be chained.
    """
    for key, args in self.selectors.items():
        setattr(self, key, extract_element(opinion, *args))
    # Cast to int for consistency with the sibling extractors, which
    # store opinion_id as a number rather than the raw attribute string.
    self.opinion_id = int(opinion["data-entry-id"])
    return self