def get_comments_with_product_id(product_id):
    """Scrape all reviews for an ASIN, paging via the last page-button.

    Returns a list of dicts with keys title/rating/body/product_id/author_url.
    Returns an empty list for a missing or malformed product id.
    """
    reviews = []
    if product_id is None:
        return reviews
    # Amazon ASINs are exactly 10 uppercase alphanumerics.
    if not re.match('^[A-Z0-9]{10}$', product_id):
        return reviews
    product_reviews_link = get_product_reviews_url(product_id)
    so = get_soup(product_reviews_link)
    # The last page button (when present) carries the total page count.
    page_buttons = so.find_all("li", {'class': 'page-button'})
    max_page_number = int(page_buttons[-1].text) if page_buttons else 1
    for page_number in range(1, max_page_number + 1):
        if page_number > 1:
            product_reviews_link = get_product_reviews_url(product_id, page_number)
            so = get_soup(product_reviews_link)
        cr_review_list_so = so.find(id='cm_cr-review_list')
        if cr_review_list_so is None:
            logging.info('No reviews for this item.')
            break
        reviews_list = cr_review_list_so.find_all(attrs={'class': 'a-section review'})
        if len(reviews_list) == 0:
            logging.info('No more reviews to unstack.')
            break
        for review in reviews_list:
            # Star rating is encoded in the third CSS class, e.g. a-star-5.
            rating = review.find(attrs={'data-hook': 'review-star-rating'}).attrs['class'][2].split('-')[-1]
            body = review.find(attrs={'data-hook': 'review-body'}).text
            title = review.find(attrs={'data-hook': 'review-title'}).text
            author_url = review.find(attrs={'data-hook': 'genome-widget'}).find('a', href=True)
            if author_url:
                author_url = author_url['href']
            try:
                helpful = review.find(attrs={'data-hook': 'helpful-vote-statement'}).text
                helpful = helpful.strip().split(' ')[0]
            except AttributeError:
                # Reviews with no votes have no helpful-vote-statement tag,
                # so .text raises AttributeError on the None find() result.
                helpful = ''
            logging.info('***********************************************')
            logging.info('TITLE = ' + title)
            logging.info('RATING = ' + rating)
            logging.info('CONTENT = ' + body)
            logging.info('HELPFUL = ' + helpful)
            # FIX: parenthesized the conditional.  Without parens it bound to
            # the whole call argument, logging a bare '' for missing authors.
            logging.info('AUTHOR URL = ' + (author_url if author_url else ''))
            logging.info('***********************************************\n')
            reviews.append({'title': title,
                            'rating': rating,
                            'body': body,
                            'product_id': product_id,
                            'author_url': author_url})
    return reviews
def get_comments_based_on_keyword(search):
    """Search Amazon for *search* and interactively scrape each result.

    Already-scraped products are reused from disk; otherwise the user is
    prompted before each fetch.  Returns the list of review filenames.
    """
    logging.info('SEARCH = {}'.format(search))
    url = (AMAZON_BASE_URL
           + '/s/ref=nb_sb_noss_2?url=search-alias%3Daps&field-keywords='
           + search + '&rh=i%3Aaps%2Ck%3A' + search)
    soup = get_soup(url)
    # Result items carry a data-index attribute; the ASIN is in data-asin.
    product_ids = [div.attrs['data-asin']
                   for div in soup.find_all('div')
                   if 'data-index' in div.attrs]
    logging.info('Found {} items.'.format(len(product_ids)))
    collected = []
    for product_id in product_ids:
        filename, already_scraped = get_reviews_filename(product_id)
        if already_scraped:
            print("File with same pid is there....")
            collected.append(filename)
            continue
        sleep(0.02)
        logging.info('product_id is {}.'.format(product_id))
        answer = input("Do you want to scrape" + format(product_id) + '\n')
        if answer == "y":
            reviews = get_comments_with_product_id(product_id)
            logging.info('Fetched {} reviews.'.format(len(reviews)))
            collected.append(persist_comment_to_disk(reviews))
    return collected
def get_comments_based_on_keyword(search):
    """Search Amazon for *search* and persist reviews for every hit.

    On a BannedException, waits 40 minutes once and retries the search page.
    """
    logging.info('SEARCH = {}'.format(search))
    url = (AMAZON_BASE_URL
           + '/s/ref=nb_sb_noss_2?url=search-alias%3Daps&field-keywords='
           + search + '&rh=i%3Aaps%2Ck%3A' + search)
    try:
        soup = get_soup(url)
    except BannedException:
        # Single back-off: sleep 40 minutes, then one retry.
        print('Waiting for chances...')
        time.sleep(60 * 40)
        soup = get_soup(url)
    # Result items carry a data-index attribute; the ASIN is in data-asin.
    asins = [div.attrs['data-asin']
             for div in soup.find_all('div')
             if 'data-index' in div.attrs]
    logging.info('Found {} items.'.format(len(asins)))
    for asin in asins:
        logging.info('product_id is {}.'.format(asin))
        fetched = get_comments_with_product_id(asin)
        logging.info('Fetched {} reviews.'.format(len(fetched)))
        persist_comment_to_disk(fetched)
def get_comments_with_product_id(product_id, count):
    """Fetch up to *count* reviews for *product_id* (10 reviews per page).

    Returns a list of dicts keyed p_id/title/rating/ReviewText/
    LastModificationTime; empty list for a missing or malformed id.
    """
    reviews = []
    if product_id is None:
        return reviews
    if not re.match('^[A-Z0-9]{10}$', product_id):
        return reviews
    # One extra page whenever count is not a multiple of 10.
    page_range = int(count / 10)
    if count % 10:
        page_range += 1
    review_index = 0
    for page_number in range(1, page_range + 1):
        so = get_soup(get_product_reviews_url(product_id, page_number))
        review_list_root = so.find(id='cm_cr-review_list')
        if review_list_root is None:
            logging.info('No reviews for this item.')
            break
        page_reviews = review_list_root.find_all(attrs={'class': 'a-section review'})
        if not page_reviews:
            logging.info('No more reviews to unstack.')
            break
        for review in page_reviews:
            # Star rating is encoded in the third CSS class, e.g. a-star-5.
            rating = review.find(attrs={'data-hook': 'review-star-rating'}).attrs['class'][2].split('-')[-1]
            body = review.find(attrs={'data-hook': 'review-body'}).text
            title = review.find(attrs={'data-hook': 'review-title'}).text
            date = review.find(attrs={'data-hook': 'review-date'}).text
            logging.info('***********************************************')
            logging.info('no.review = ' + str(review_index) + " /" + str(count))
            logging.info('TITLE = ' + title)
            logging.info('RATING = ' + rating)
            logging.info('CONTENT = ' + body)
            logging.info('DATE = ' + date)
            logging.info('***********************************************\n')
            reviews.append({
                'p_id': product_id,
                'title': title,
                'rating': rating,
                'ReviewText': body,
                'LastModificationTime': date
            })
            review_index += 1
    return reviews
def get_comments_with_product_id(product_id):
    """Scrape reviews for an ASIN, walking at most 100 review pages.

    Returns a list of dicts with keys title/rating/body/product_id;
    empty list for a missing or malformed product id.
    """
    reviews = []
    if product_id is None:
        return reviews
    # Amazon ASINs are exactly 10 uppercase alphanumerics.
    if not re.match('^[A-Z0-9]{10}$', product_id):
        return reviews
    for page_number in range(100):
        product_reviews_link = get_product_reviews_url(product_id, page_number)
        so = get_soup(product_reviews_link)
        cr_review_list_so = so.find(id='cm_cr-review_list')
        if cr_review_list_so is None:
            logging.info('No reviews for this item.')
            break
        reviews_list = cr_review_list_so.find_all(attrs={'class': 'a-section review'})
        if len(reviews_list) == 0:
            logging.info('No more reviews to unstack.')
            break
        for review in reviews_list:
            # Star rating is encoded in the third CSS class, e.g. a-star-5.
            rating = review.find(attrs={'data-hook': 'review-star-rating'}).attrs['class'][2].split('-')[-1]
            body = review.find(attrs={'data-hook': 'review-body'}).text
            title = review.find(attrs={'data-hook': 'review-title'}).text
            try:
                helpful = review.find(attrs={'data-hook': 'helpful-vote-statement'}).text
                helpful = helpful.strip().split(' ')[0]
            except AttributeError:
                # FIX: was a bare except.  Reviews with no votes have no
                # helpful-vote-statement tag, so .text raises AttributeError
                # on the None find() result; that is the only expected case.
                helpful = ''
            logging.info('***********************************************')
            logging.info('TITLE = ' + title)
            logging.info('RATING = ' + rating)
            logging.info('CONTENT = ' + body)
            logging.info('HELPFUL = ' + helpful)
            logging.info('***********************************************\n')
            reviews.append({
                'title': title,
                'rating': rating,
                'body': body,
                'product_id': product_id
            })
    return reviews
def extract_product_ids_from_link(category_link):
    """Collect product ids from every product-looking anchor on a page.

    /gp/product/ links are considered first, then /dp/ links; ids that
    fail to parse are dropped.
    """
    soup = get_soup(category_link)
    hrefs = [a.attrs['href'] for a in soup.find_all('a') if 'href' in a.attrs]
    # Same ordering as before: all /gp/product/ links, then all /dp/ links.
    candidate_links = ([h for h in hrefs if '/gp/product/' in h]
                       + [h for h in hrefs if '/dp/' in h])
    # extract_product_id returns None on failure; keep only real ids.
    return [pid for pid in map(extract_product_id, candidate_links)
            if pid is not None]
def get_comments_based_on_keyword(search):
    """Search amazon.co.jp for *search* and scrape reviews for each hit."""
    logging.info('SEARCH = {}'.format(search))
    url = 'http://www.amazon.co.jp/s/ref=nb_sb_noss_2?url=search-alias%3Daps&field-keywords=' + \
        search + '&rh=i%3Aaps%2Ck%3A' + search
    soup = get_soup(url)
    # Keep only result anchors that have both a title (h2) and a valid href.
    items = []
    for anchor in soup.find_all('a', class_='s-access-detail-page'):
        heading = anchor.find('h2')
        href = anchor.get('href')
        if heading is not None and validators.url(href):
            items.append((href, str(heading.string)))
    logging.info('Found {} items.'.format(len(items)))
    for link, name in items:
        logging.debug('link = {}, name = {}'.format(link, name))
        get_comments_with_product_id(extract_product_id(link))
def get_comments_with_product_id(product_id):
    """Scrape review bodies for an ASIN and mirror them to comments/<id>.txt.

    Returns the list of ASCII-sanitized review bodies; empty list for a
    missing or malformed product id.
    """
    reviews = []
    if product_id is None:
        return reviews
    # Amazon ASINs are exactly 10 uppercase alphanumerics.
    if not re.match('^[A-Z0-9]{10}$', product_id):
        return reviews
    # FIX: the original opened the file without ever closing it; the `with`
    # block guarantees the handle is closed even if a request/parse raises.
    with open('comments/' + product_id + '.txt', 'w') as f:
        for page_number in range(100):
            product_reviews_link = get_product_reviews_url(product_id, page_number)
            so = get_soup(product_reviews_link)
            cr_review_list_so = so.find(id='cm_cr-review_list')
            if cr_review_list_so is None:
                logging.info('No reviews for this item.')
                break
            reviews_list = cr_review_list_so.find_all(attrs={'class': 'a-section review'})
            if len(reviews_list) == 0:
                logging.info('No more reviews to unstack.')
                break
            # NOTE: rating/title were extracted but never used in the
            # original; only the body is written and returned.
            for review in reviews_list:
                body = review.find(attrs={'data-hook': 'review-body'}).text
                # Strip non-ASCII so the text file stays encoding-safe.
                body = body.encode('ascii', 'ignore').decode('ascii')
                f.write(body)
                f.write('\n')
                reviews.append(body)
    return reviews
def get_comments_with_product_id(product_id):
    """Scrape every review for an ASIN, paging off the total review count.

    Returns a list of dicts (title/rating/body/product_id/author_url/
    review_url/review_date); empty list for a bad id or when the page has
    no total-review-count element.
    """
    reviews = []
    if product_id is None:
        return reviews
    # Amazon ASINs are exactly 10 uppercase alphanumerics.
    if not re.match('^[A-Z0-9]{10}$', product_id):
        return reviews
    product_reviews_link = get_product_reviews_url(product_id)
    so = get_soup(product_reviews_link)
    max_page_number = so.find(attrs={'data-hook': 'total-review-count'})
    if max_page_number is None:
        return reviews
    # Keep only the digits (the count text may contain words/separators).
    max_page_number = ''.join([el for el in max_page_number.text if el.isdigit()])
    max_page_number = int(max_page_number) if max_page_number else 1
    max_page_number *= 0.1  # displaying 10 results per page. So if 663 results then ~66 pages.
    max_page_number = math.ceil(max_page_number)
    for page_number in range(1, max_page_number + 1):
        if page_number > 1:
            product_reviews_link = get_product_reviews_url(product_id, page_number)
            so = get_soup(product_reviews_link)
        cr_review_list_so = so.find(id='cm_cr-review_list')
        if cr_review_list_so is None:
            logging.info('No reviews for this item.')
            break
        reviews_list = cr_review_list_so.find_all('div', {'data-hook': 'review'})
        if len(reviews_list) == 0:
            logging.info('No more reviews to unstack.')
            break
        for review in reviews_list:
            # Star rating is encoded in the third CSS class, e.g. a-star-5.
            rating = review.find(attrs={'data-hook': 'review-star-rating'}).attrs['class'][2].split('-')[-1].strip()
            body = review.find(attrs={'data-hook': 'review-body'}).text.strip()
            title = review.find(attrs={'data-hook': 'review-title'}).text.strip()
            author_url = review.find(attrs={'data-hook': 'genome-widget'}).find('a', href=True)
            review_url = review.find(attrs={'data-hook': 'review-title'}).attrs['href']
            review_date = review.find(attrs={'data-hook': 'review-date'}).text.strip()
            if author_url:
                author_url = author_url['href'].strip()
            try:
                helpful = review.find(attrs={'data-hook': 'helpful-vote-statement'}).text.strip()
                helpful = helpful.strip().split(' ')[0]
            except AttributeError:
                # FIX: was a bare except.  Reviews with no votes have no
                # helpful-vote-statement tag; .text on None is the only
                # expected failure here.
                helpful = ''
            logging.info('***********************************************')
            logging.info('TITLE = ' + title)
            logging.info('RATING = ' + rating)
            logging.info('CONTENT = ' + '\n'.join(textwrap.wrap(body, 80)))
            logging.info('HELPFUL = ' + helpful)
            # FIX: parenthesized the conditionals.  Without parens each one
            # bound to the whole call argument, logging a bare '' whenever
            # the value was missing.
            logging.info('AUTHOR URL = ' + (author_url if author_url else ''))
            logging.info('REVIEW URL = ' + (review_url if review_url else ''))
            logging.info('REVIEW DATE = ' + (review_date if review_date else ''))
            logging.info('***********************************************\n')
            reviews.append({'title': title,
                            'rating': rating,
                            'body': body,
                            'product_id': product_id,
                            'author_url': author_url,
                            'review_url': review_url,
                            'review_date': review_date,
                            })
    return reviews
def get_comments_with_product_id(product_id):
    """Scrape every review for an ASIN, skipping captcha pages and dumping
    an intermediate JSON cache every CACHE_CHECK reviews.

    Returns a list of review dicts (title/rating/body/product_id/
    author_url/review_url/review_date); empty list for a bad id or when
    the page has no total-review-count element.
    """
    reviews = []
    if product_id is None:
        return reviews
    # Amazon ASINs are exactly 10 uppercase alphanumerics.
    if not re.match('^[A-Z0-9]{10}$', product_id):
        return reviews
    product_reviews_link = get_product_reviews_url(product_id)
    so = get_soup(product_reviews_link)
    max_page_number = so.find(attrs={'data-hook': 'total-review-count'})
    if max_page_number is None:
        return reviews
    # Keep only the digits (the count text may contain words/separators).
    max_page_number = ''.join([el for el in max_page_number.text if el.isdigit()])
    max_page_number = int(max_page_number) if max_page_number else 1
    max_page_number *= 0.1  # displaying 10 results per page. So if 663 results then ~66 pages.
    max_page_number = math.ceil(max_page_number)
    rev_count = 0
    for page_number in range(1, max_page_number + 1):
        if page_number > 1:
            product_reviews_link = get_product_reviews_url(product_id, page_number)
            so = get_soup(product_reviews_link)
        # Skip the page if the retry limit was reached and we got a captcha.
        if 'captcha' in str(so):
            continue
        cr_review_list_so = so.find(id='cm_cr-review_list')
        if cr_review_list_so is None:
            logging.info('No reviews for this item.')
            break
        reviews_list = cr_review_list_so.find_all('div', {'data-hook': 'review'})
        if len(reviews_list) == 0:
            logging.info('No more reviews to unstack.')
            break
        for review in reviews_list:
            try:
                rating = review.find(attrs={'data-hook': 'review-star-rating'}).attrs['class'][2].split('-')[-1].strip()
                body = review.find(attrs={'data-hook': 'review-body'}).text.strip()
                title = review.find(attrs={'data-hook': 'review-title'}).text.strip()
                author_url = review.find(attrs={'data-hook': 'genome-widget'}).find('a', href=True)
                review_url = review.find(attrs={'data-hook': 'review-title'}).attrs['href']
                review_date = review.find(attrs={'data-hook': 'review-date'}).text.strip()
            except Exception:
                logging.info('!!! Unpacking review failed!')
                logging.info('With review' + str(review))
                # FIX: skip this review.  The original fell through and then
                # used the fields anyway -- a NameError on the very first
                # review, or silently stale data from the previous one.
                continue
            if author_url:
                try:
                    tmp = author_url['href'].strip()
                except TypeError:
                    logging.info('!!! Unpacking author_url failed!')
                    logging.info('Using raw' + str(author_url))
                    tmp = author_url
                author_url = tmp
            try:
                helpful = review.find(attrs={'data-hook': 'helpful-vote-statement'}).text.strip()
                helpful = helpful.strip().split(' ')[0]
            except AttributeError:
                # FIX: was a bare except.  Reviews with no votes have no
                # helpful-vote-statement tag; .text on None is the only
                # expected failure here.
                helpful = ''
            logging.info('***********************************************')
            logging.info('TITLE = ' + title)
            logging.info('RATING = ' + rating)
            logging.info('CONTENT = ' + '\n'.join(textwrap.wrap(body, 80)))
            logging.info('HELPFUL = ' + helpful)
            # FIX: parenthesized the conditionals so the prefix is always
            # logged (without parens they swallowed the whole argument).
            logging.info('AUTHOR URL = ' + (author_url if author_url else ''))
            logging.info('REVIEW URL = ' + (review_url if review_url else ''))
            logging.info('REVIEW DATE = ' + (review_date if review_date else ''))
            logging.info('***********************************************\n')
            reviews.append({'title': title,
                            'rating': rating,
                            'body': body,
                            'product_id': product_id,
                            'author_url': author_url,
                            'review_url': review_url,
                            'review_date': review_date,
                            })
            rev_count += 1
            # Periodic checkpoint so a ban mid-scrape doesn't lose progress.
            if rev_count == CACHE_CHECK:
                with open(CACHE_FILE, 'w', encoding='utf-8') as fp:
                    json.dump(reviews, fp, sort_keys=True, indent=4, ensure_ascii=False)
                rev_count = 0
    return reviews
def get_random_product_ids(output_filename):
    """Randomly explore Amazon category pages, streaming unique product ids.

    The walk is stochastic by design (the link frontier is shuffled and
    popped), so a re-run is a fresh exploration rather than a resumed
    checkpoint.
    """
    logging.info('Writing to {}'.format(output_filename))
    with open(output_filename, 'w') as out:
        directory_soup = get_soup(AMAZON_BASE_URL + '/gp/site-directory/ref=nav_shopall_btn')
        # can have more by clicking on those buttons.
        category_links = [a.attrs['href']
                          for a in directory_soup.find_all('a', {'class': 'nav_a'})]
        all_product_ids = set()
        more_category_links = list(category_links)
        # Pass 1: widen the frontier with every /s/ search link we can find.
        for step, category_link in enumerate(category_links):
            try:
                logging.info('({}/{}) get as many links as we can.'.format(
                    step, len(category_links)))
                category_soup = get_soup(category_link)
                more_category_links.extend(
                    a.attrs['href'] for a in category_soup.find_all('a')
                    if 'href' in a.attrs and a.attrs['href'].startswith('/s/'))  # or /b/
                logging.info('{} links found so far.'.format(len(more_category_links)))
            except BannedException as be:
                raise be
            except Exception as e:
                logging.error('Exception occurred. Skipping')
                logging.error(e)
        random.shuffle(more_category_links)
        # Pass 2: drain the frontier, writing fresh product ids as we go.
        step = 0
        while more_category_links:
            step += 1
            logging.info('Stack length = {}'.format(len(more_category_links)))
            category_link = more_category_links.pop()
            try:
                logging.info('({}/{}) get as many products as we can.'.format(
                    step, len(more_category_links)))
                cur_product_ids = extract_product_ids_from_link(category_link)
                logging.info(cur_product_ids)
                for product_id in cur_product_ids:
                    if product_id in all_product_ids:
                        continue
                    all_product_ids.add(product_id)
                    out.write('{}\n'.format(product_id))
                    out.flush()
                logging.info('{} products found at this step.'.format(
                    len(cur_product_ids)))
                logging.info('{} unique products found so far.'.format(
                    len(all_product_ids)))
                # Fan out over paginated results, unless this link is
                # itself already a paginated one.
                if cur_product_ids and 'page' not in category_link:
                    more_category_links.extend(
                        category_link + '&page={}'.format(jj)
                        for jj in range(2, 50))
            except BannedException as be:
                raise be
            except Exception as e:
                logging.error('Exception occurred. Skipping')
                logging.error(e)
def get_comments_with_product_id(product_id, skip_num):
    """Scrape reviews for an ASIN starting at roughly page ceil(skip_num/10),
    persisting each review row to CSV as soon as it is parsed.

    Returns True once scraping ran to completion; False for a missing or
    malformed id, or when no total review count could be read.
    """
    if product_id is None:
        return False
    # Amazon ASINs are exactly 10 uppercase alphanumerics.
    if not re.match('^[A-Z0-9]{10}$', product_id):
        return False
    product_reviews_link = get_product_reviews_url(product_id)
    so = get_soup(product_reviews_link)
    product_title = so.find(attrs={'data-hook': 'product-link'})
    if product_title is None:
        product_title = 'unknown'
    else:
        product_title = product_title.text
    logging.info('product title: {}'.format(product_title))
    max_page_number = so.find(attrs={'data-hook': 'total-review-count'})
    if max_page_number is None:
        return False
    # Keep only the digits (the count text may contain words/separators).
    max_page_number = ''.join([el for el in max_page_number.text if el.isdigit()])
    max_page_number = int(max_page_number) if max_page_number else 1
    # Clamp the resume point inside the available range.
    skip_num = skip_num if skip_num < max_page_number else 1
    max_page_number *= 0.1  # displaying 10 results per page. So if 663 results then ~66 pages.
    skip_num *= 0.1
    max_page_number = math.ceil(max_page_number)
    min_page_number = math.ceil(skip_num)
    for page_number in range(min_page_number, max_page_number + 1):
        # Simple ASCII progress bar: stars for done, dots for remaining.
        logging.info('{:<10s} {:2.1f}% page {} of {}'.format(
            ('*' * math.floor(page_number / max_page_number * 10)).ljust(10, '.'),
            page_number / max_page_number * 100,
            page_number,
            max_page_number))
        if page_number > 1:
            product_reviews_link = get_product_reviews_url(product_id, page_number)
            so = get_soup(product_reviews_link)
        cr_review_list_so = so.find(id='cm_cr-review_list')
        if cr_review_list_so is None:
            logging.info('No reviews for this item.')
            break
        reviews_list = cr_review_list_so.find_all('div', {'data-hook': 'review'})
        if len(reviews_list) == 0:
            logging.info('No more reviews to unstack.')
            break
        for review in reviews_list:
            # Star rating is encoded in the third CSS class, e.g. a-star-5.
            rating = review.find(attrs={'data-hook': 'review-star-rating'}).attrs['class'][2].split('-')[-1].strip()
            body = review.find(attrs={'data-hook': 'review-body'}).text.strip()
            title = review.find(attrs={'data-hook': 'review-title'}).text.strip()
            author_url = review.find(attrs={'data-hook': 'genome-widget'}).find('a', href=True)
            review_url = '{}{}'.format(
                AMAZON_BASE_URL,
                review.find(attrs={'data-hook': 'review-title'}).attrs['href'])
            review_date = review.find(attrs={'data-hook': 'review-date'}).text.strip()
            if author_url:
                author_url = '{}{}'.format(AMAZON_BASE_URL, author_url['href'].strip())
            try:
                helpful = review.find(attrs={'data-hook': 'helpful-vote-statement'}).text.strip()
                helpful = helpful.strip().split(' ')[0]
            except AttributeError:
                # FIX: was a bare except.  Reviews with no votes have no
                # helpful-vote-statement tag; default to zero votes.
                helpful = '0'
            print(
                '{:<20s}'.format(review_date if review_date else '--/--/----') + \
                '\tRating:' + rating + \
                '\t ' + title)
            review_row = {
                'title': title,
                'rating': rating,
                'body': body,
                'helpful': helpful,
                'product_id': product_id,
                'author_url': author_url,
                'review_url': review_url,
                'review_date': review_date,
                'product_title': product_title
            }
            # Persist immediately so a ban mid-product keeps earlier rows.
            persist_comment_to_disk_in_csv(review_row)
    return True