def download_page(soup, category_name=None):
    product_list = soup.find('div', 'n-snippet-list').find_all(
        'div', 'n-snippet-card2', recursive=False)
    logger.debug('download_page got {} items'.format(len(product_list)))
    for i in product_list:
        download_item(i, category_name)
async def main(link_set):
    tasks = []
    while link_set:
        url_object_form_json = link_set.pop()
        logger.debug(f'url_object_form_json {str(url_object_form_json)}')
        tasks.append(asyncio.create_task(download_page(url_object_form_json)))
        # print(tasks)
    await asyncio.gather(*tasks)
def __init__(self, soup):
    self.soup = soup
    self.pros = ''
    self.cons = ''
    self.comment = ''
    # write_tmp_soup(soup)
    try:
        self.id = soup['data-review-id']
    except KeyError:
        self.id = soup.div['data-review-id']
    data = self.soup.find('div', {'itemprop': 'review'})
    if data:
        logger.debug('Review trying NEW method...')
        self.date = datetime.strptime(
            data.find('meta', {'itemprop': 'datePublished'})['content'],
            '%Y-%m-%d')
        self.author = data.find('meta', {'itemprop': 'author'})['content']
        self.description = data.find(
            'meta', {'itemprop': 'description'})['content']
        self.rating = data.find('div', {
            'itemprop': 'reviewRating'
        }).find('meta', {'itemprop': 'ratingValue'})['content']
        self.parse_description()
    else:
        logger.debug('Review trying OLD method...')
        self.date = self.process_date_string(
            soup.find('span', 'n-product-review-item__date-region').string)
        self.author = soup.find('', 'n-product-review-user__name').string
        # self.comment = soup.find('dd', 'n-product-review-item__text').string
        # self.description = f'{self.COMMENT_WORD} {self.comment}'
        self.rating = soup.find('div', 'rating__value').string
        self.process_description_from_markup()
def session(self):
    if self.__session__:
        return self.__session__
    print(self.model.__tablename__, ' session init')
    logger.debug(f'{self.__class__} session init')
    self.recreate_session()
    return self.__session__
def get_reviews_all(self):
    logger.debug('ReviewList::getting data from page 1')
    self.get_reviews_from_page(self.soup)
    for page_num in range(2, self.pages + 1):
        new_url = '{}&{}'.format(self.url, PAGE_PARAM.format(page_num))
        logger.debug(
            'ReviewList::getting data from page {}'.format(page_num))
        soup = get_soup(new_url)
        self.get_reviews_from_page(soup)
async def download_link(soup, category_name=None):
    session = Session()
    product_list = soup.find('div', 'n-snippet-list').find_all(
        'div', 'n-snippet-card2', recursive=False)
    logger.debug('download_link got {} items'.format(len(product_list)))
    tasks = []
    for i in product_list:
        tasks.append(
            asyncio.create_task(
                download_item(session=session,
                              product_soup=i,
                              category_name=category_name)))
    print('LOOP END')
    await asyncio.gather(*tasks)
def download_list_all(url_object):
    category_name, url = url_object.get('category_name'), url_object['url']
    soup = get_soup(url)
    pages_count = get_pages_count(soup)
    logger.debug(f'Found {pages_count} pages for list url {url}')
    logger.debug('download_list::getting data from page 1')
    download_page(soup, category_name)
    for page_num in range(2, pages_count + 1):
        logger.debug(f'download_list::getting data from page {page_num}')
        new_url = '{}&{}'.format(url, PAGE_PARAM.format(page_num))
        soup = get_soup(new_url)
        download_page(soup, category_name)
async def download_page(url_object):
    category_name, url = url_object.get('category_name'), url_object['url']
    # try:  # todo: remove all tries like that
    soup = await get_soup(url)
    # if not soup:
    #     logger.error(f'{category_name}, {url}')
    #     raise TypeError
    # except (ConnectionError, TimeoutError):
    #     logger.critical(f'Some shit happened to url: {url} :: 64')
    #     return
    pages_count = get_pages_count(soup)
    logger.debug(f'Found {pages_count} pages for list url {url}')
    logger.debug('download_page::getting data from page 1')
    tasks = [asyncio.create_task(download_link(soup, category_name))]
    for page_num in range(2, pages_count + 1):
        logger.debug(f'download_page::getting data from page {page_num}')
        new_url = '{}&{}'.format(url, PAGE_PARAM.format(page_num))
        soup = await get_soup(new_url)
        tasks.append(asyncio.create_task(download_link(soup, category_name)))
    await asyncio.gather(*tasks)
if __name__ == '__main__':
    link_set = get_link_set()
    # link_set = json.load(open(LINK_SET_PATH, 'r', encoding='utf8'))
    logger.info('START\n')
    # random.shuffle(link_set)
    DOWNLOAD_IMAGES = input('DOWNLOAD_IMAGES? (Y/N)\n').lower() == 'y'
    RESOLVE_OTHER_SHOP_URL = input('RESOLVE_OTHER_SHOP_URL? (Y/N)\n').lower() == 'y'
    while link_set:
        url_object_form_json = link_set.pop()
        logger.debug(f'url_object_form_json {str(url_object_form_json)}')
        download_list_all(url_object_form_json)
def __init__(self, url):
    self.url = url
    self.soup = get_soup(url)
    self.pages = get_pages_count(self.soup)
    logger.debug('Review pages found: {}'.format(self.pages))
    self.get_reviews_all()
def get_reviews_from_page(self, soup):
    review_items = soup.find_all('div', 'n-product-review-item')
    logger.debug('ReviewList got {} items'.format(len(review_items)))
    for i in review_items:
        self.append(Review(i))