Example 1
def download_page(soup, category_name=None):
	product_list = soup.find('div', 'n-snippet-list').find_all('div', 'n-snippet-card2', recursive=False)

	logger.debug('download_page got {} items'.format(len(product_list)))

	for i in product_list:
		download_item(i, category_name)
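Several of these snippets rely on a get_soup helper that is never shown; a minimal synchronous sketch, assuming requests and BeautifulSoup (the timeout and parser choice are assumptions):

import requests
from bs4 import BeautifulSoup

def get_soup(url):
    # Hypothetical helper: fetch the page and return a parsed tree.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')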
Example 2
async def main(link_set):
    tasks = []
    while link_set:
        url_object_form_json = link_set.pop()
        logger.debug(f'url_object_form_json {str(url_object_form_json)}')
        tasks.append(asyncio.create_task(download_page(url_object_form_json)))
    # print(tasks)
    await asyncio.gather(*tasks)
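asyncio.create_task needs a running event loop, so main is presumably started with asyncio.run; a minimal driver sketch (the sample link_set entry is made up):

import asyncio

if __name__ == '__main__':
    # Hypothetical entry point: each item mirrors the url_object dicts used in the other examples.
    link_set = [{'category_name': 'phones', 'url': 'https://example.com/catalog?page=1'}]
    asyncio.run(main(link_set))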
Example 3
    def __init__(self, soup):
        self.soup = soup
        self.pros = ''
        self.cons = ''
        self.comment = ''

        # write_tmp_soup(soup)

        # The review id may sit on the node itself or on its first child <div>.
        try:
            self.id = soup['data-review-id']
        except KeyError:
            self.id = soup.div['data-review-id']

        # Prefer the microdata layout (itemprop="review") when the page provides it.
        data = self.soup.find('div', {'itemprop': 'review'})
        if data:
            logger.debug('Review trying NEW method...')
            self.date = datetime.strptime(
                data.find('meta', {'itemprop': 'datePublished'})['content'],
                '%Y-%m-%d')
            self.author = data.find('meta', {'itemprop': 'author'})['content']
            self.description = data.find(
                'meta', {'itemprop': 'description'})['content']
            self.rating = data.find('div', {
                'itemprop': 'reviewRating'
            }).find('meta', {'itemprop': 'ratingValue'})['content']
            self.parse_description()
        else:
            # Fall back to the legacy class-based markup.
            logger.debug('Review trying OLD method...')
            self.date = self.process_date_string(
                soup.find('span', 'n-product-review-item__date-region').string)
            self.author = soup.find('', 'n-product-review-user__name').string
            # self.comment = soup.find('dd', 'n-product-review-item__text').string
            # self.description = f'{self.COMMENT_WORD} {self.comment}'
            self.rating = soup.find('div', 'rating__value').string
            self.process_description_from_markup()
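The two branches expect different markup; a sketch of the microdata layout the "NEW" path reads, with made-up attribute values:

from bs4 import BeautifulSoup

markup = '''
<div data-review-id="1">
  <div itemprop="review">
    <meta itemprop="datePublished" content="2020-01-15">
    <meta itemprop="author" content="user42">
    <meta itemprop="description" content="Pros: fast. Cons: pricey.">
    <div itemprop="reviewRating"><meta itemprop="ratingValue" content="4"></div>
  </div>
</div>
'''
node = BeautifulSoup(markup, 'html.parser').div
data = node.find('div', {'itemprop': 'review'})
print(node['data-review-id'])                                        # -> '1'
print(data.find('meta', {'itemprop': 'datePublished'})['content'])   # -> '2020-01-15'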
Example 4
 def session(self):
     # Lazily create the database session on first access and cache it.
     if self.__session__:
         return self.__session__
     print(self.model.__tablename__, ' session init')
     logger.debug(f'{self.__class__} session init')
     self.recreate_session()
     return self.__session__
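recreate_session and __session__ are not shown anywhere in these examples; a plausible sketch with SQLAlchemy, where the engine URL and the sessionmaker setup are assumptions:

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

def recreate_session(self):
    # Hypothetical implementation: open a fresh session and cache it on the instance.
    engine = create_engine('sqlite:///catalog.db')  # assumed connection string
    self.__session__ = sessionmaker(bind=engine)()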
Example 5
 def get_reviews_all(self):
     logger.debug('ReviewList::getting data from page 1')
     self.get_reviews_from_page(self.soup)
     for page_num in range(2, self.pages + 1):
         new_url = '{}&{}'.format(self.url, PAGE_PARAM.format(page_num))
         logger.debug(
             'ReviewList::getting data from page {}'.format(page_num))
         soup = get_soup(new_url)
         self.get_reviews_from_page(soup)
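PAGE_PARAM is never defined in these excerpts; it is evidently a format string appended as an extra query parameter, for instance (an assumed definition):

PAGE_PARAM = 'page={}'  # assumption: the real parameter name is not shown
# '{}&{}'.format('https://example.com/reviews?id=1', PAGE_PARAM.format(2))
# -> 'https://example.com/reviews?id=1&page=2'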
Example 6
async def download_link(soup, category_name=None):
    session = Session()
    product_list = soup.find('div',
                             'n-snippet-list').find_all('div',
                                                        'n-snippet-card2',
                                                        recursive=False)

    logger.debug('download_link got {} items'.format(len(product_list)))

    tasks = []
    for i in product_list:
        tasks.append(
            asyncio.create_task(
                download_item(session=session,
                              product_soup=i,
                              category_name=category_name)))

    print('LOOP END')
    await asyncio.gather(*tasks)
Example 7
def download_list_all(url_object):
	category_name, url = url_object.get('category_name'), url_object['url']
	soup = get_soup(url)
	pages_count = get_pages_count(soup)

	logger.debug(f'Found {pages_count} pages for list url {url}')
	logger.debug('download_list::getting data from page 1')
	download_page(soup, category_name)

	for page_num in range(2, pages_count + 1):
		logger.debug(f'download_list::getting data from page {page_num}')
		new_url = '{}&{}'.format(url, PAGE_PARAM.format(page_num))
		soup = get_soup(new_url)
		download_page(soup, category_name)
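get_pages_count is used in several of these examples but never defined; a sketch that counts pagination links, where both class names are purely assumptions:

def get_pages_count(soup):
    # Hypothetical helper: read the last pagination link, default to a single page.
    pager = soup.find('div', 'n-pager')  # assumed class name
    if not pager:
        return 1
    pages = pager.find_all('a', 'n-pager__page')  # assumed class name
    return int(pages[-1].string) if pages else 1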
Example 8
async def download_page(url_object):
    category_name, url = url_object.get('category_name'), url_object['url']
    # try: #todo: remove all tries like that
    soup = await get_soup(url)
    # if not soup:
    # 	logger.error(f'{category_name}, {url}')
    # 	raise TypeError
    # except (ConnectionError, TimeoutError):
    # 	logger.critical(f'Some shit happened to url: {url} :: 64')
    # 	return
    pages_count = get_pages_count(soup)

    logger.debug(f'Found {pages_count} pages for list url {url}')
    logger.debug('download_page::getting data from page 1')

    tasks = [asyncio.create_task(download_link(soup, category_name))]

    for page_num in range(2, pages_count + 1):
        logger.debug(f'download_page::getting data from page {page_num}')
        new_url = '{}&{}'.format(url, PAGE_PARAM.format(page_num))
        soup = await get_soup(new_url)
        tasks.append(asyncio.create_task(download_link(soup, category_name)))

    await asyncio.gather(*tasks)
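Here get_soup is awaited, so an asynchronous variant is implied; a minimal sketch using aiohttp (an assumption, the real helper is not shown):

import aiohttp
from bs4 import BeautifulSoup

async def get_soup(url):
    # Hypothetical async counterpart of the synchronous helper sketched earlier.
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            html = await response.text()
    return BeautifulSoup(html, 'html.parser')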
Example 9
		download_item(i, category_name)


def download_list_all(url_object):
	category_name, url = url_object.get('category_name'), url_object['url']
	soup = get_soup(url)
	pages_count = get_pages_count(soup)

	logger.debug(f'Found {pages_count} pages for list url {url}')
	logger.debug('download_list::getting data from page 1')
	download_page(soup, category_name)

	for page_num in range(2, pages_count + 1):
		logger.debug(f'download_list::getting data from page {page_num}')
		new_url = '{}&{}'.format(url, PAGE_PARAM.format(page_num))
		soup = get_soup(new_url)
		download_page(soup, category_name)


if __name__ == '__main__':
	link_set = get_link_set()
	# link_set = json.load(open(LINK_SET_PATH, 'r', encoding='utf8'))
	logger.info('START\n')
	# random.shuffle(link_set)
	DOWNLOAD_IMAGES = input('DOWNLOAD_IMAGES? (Y/N)\n').lower() == 'y'
	RESOLVE_OTHER_SHOP_URL = input('RESOLVE_OTHER_SHOP_URL? (Y/N)\n').lower() == 'y'
	while link_set:
		url_object_form_json = link_set.pop()
		logger.debug(f'url_object_form_json {str(url_object_form_json)}')
		download_list_all(url_object_form_json)
Example 10
 def __init__(self, url):
     self.url = url
     self.soup = get_soup(url)
     self.pages = get_pages_count(self.soup)
     logger.debug('Review pages found: {}'.format(self.pages))
     self.get_reviews_all()
Example 11
 def get_reviews_from_page(self, soup):
     review_items = soup.find_all('div', 'n-product-review-item')
     logger.debug('ReviewList got {} items'.format(len(review_items)))
     for i in review_items:
         self.append(Review(i))
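get_reviews_from_page appends to self, which suggests ReviewList extends the built-in list; a skeleton consistent with Examples 5, 10 and 11 (the base class is an assumption):

class ReviewList(list):
    # Assumed skeleton: Review objects are stored directly on the list itself.
    def __init__(self, url):
        super().__init__()
        self.url = url
        self.soup = get_soup(url)
        self.pages = get_pages_count(self.soup)
        self.get_reviews_all()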