def get_comments_with_product_id(product_id):
    """Scrape all reviews for an ASIN, paging via the last page-button.

    Returns a list of dicts with keys title/rating/body/product_id/author_url.
    Returns an empty list for a missing or malformed product id.
    """
    reviews = []
    if product_id is None:
        return reviews
    # Amazon ASINs are exactly 10 uppercase alphanumerics.
    if not re.match('^[A-Z0-9]{10}$', product_id):
        return reviews
    product_reviews_link = get_product_reviews_url(product_id)
    so = get_soup(product_reviews_link)
    # The last page button (when present) carries the total page count.
    page_buttons = so.find_all("li", {'class': 'page-button'})
    max_page_number = int(page_buttons[-1].text) if page_buttons else 1
    for page_number in range(1, max_page_number + 1):
        if page_number > 1:
            product_reviews_link = get_product_reviews_url(product_id, page_number)
            so = get_soup(product_reviews_link)
        cr_review_list_so = so.find(id='cm_cr-review_list')
        if cr_review_list_so is None:
            logging.info('No reviews for this item.')
            break
        reviews_list = cr_review_list_so.find_all(attrs={'class': 'a-section review'})
        if len(reviews_list) == 0:
            logging.info('No more reviews to unstack.')
            break
        for review in reviews_list:
            # Star rating is encoded in the third CSS class, e.g. a-star-5.
            rating = review.find(attrs={'data-hook': 'review-star-rating'}).attrs['class'][2].split('-')[-1]
            body = review.find(attrs={'data-hook': 'review-body'}).text
            title = review.find(attrs={'data-hook': 'review-title'}).text
            author_url = review.find(attrs={'data-hook': 'genome-widget'}).find('a', href=True)
            if author_url:
                author_url = author_url['href']
            try:
                helpful = review.find(attrs={'data-hook': 'helpful-vote-statement'}).text
                helpful = helpful.strip().split(' ')[0]
            except AttributeError:
                # Reviews with no votes have no helpful-vote-statement tag,
                # so .text raises AttributeError on the None find() result.
                helpful = ''
            logging.info('***********************************************')
            logging.info('TITLE = ' + title)
            logging.info('RATING = ' + rating)
            logging.info('CONTENT = ' + body)
            logging.info('HELPFUL = ' + helpful)
            # FIX: parenthesized the conditional.  Without parens it bound to
            # the whole call argument, logging a bare '' for missing authors.
            logging.info('AUTHOR URL = ' + (author_url if author_url else ''))
            logging.info('***********************************************\n')
            reviews.append({'title': title,
                            'rating': rating,
                            'body': body,
                            'product_id': product_id,
                            'author_url': author_url})
    return reviews
def get_comments_based_on_keyword(search):
    """Search Amazon for *search* and interactively scrape each result.

    Already-scraped products are reused from disk; otherwise the user is
    prompted before each fetch.  Returns the list of review filenames.
    """
    logging.info('SEARCH = {}'.format(search))
    url = (AMAZON_BASE_URL
           + '/s/ref=nb_sb_noss_2?url=search-alias%3Daps&field-keywords='
           + search + '&rh=i%3Aaps%2Ck%3A' + search)
    soup = get_soup(url)
    # Result items carry a data-index attribute; the ASIN is in data-asin.
    product_ids = [div.attrs['data-asin']
                   for div in soup.find_all('div')
                   if 'data-index' in div.attrs]
    logging.info('Found {} items.'.format(len(product_ids)))
    collected = []
    for product_id in product_ids:
        filename, already_scraped = get_reviews_filename(product_id)
        if already_scraped:
            print("File with same pid is there....")
            collected.append(filename)
            continue
        sleep(0.02)
        logging.info('product_id is {}.'.format(product_id))
        answer = input("Do you want to scrape" + format(product_id) + '\n')
        if answer == "y":
            reviews = get_comments_with_product_id(product_id)
            logging.info('Fetched {} reviews.'.format(len(reviews)))
            collected.append(persist_comment_to_disk(reviews))
    return collected
def get_comments_based_on_keyword(search):
    """Search Amazon for *search* and persist reviews for every hit.

    On a BannedException, waits 40 minutes once and retries the search page.
    """
    logging.info('SEARCH = {}'.format(search))
    url = (AMAZON_BASE_URL
           + '/s/ref=nb_sb_noss_2?url=search-alias%3Daps&field-keywords='
           + search + '&rh=i%3Aaps%2Ck%3A' + search)
    try:
        soup = get_soup(url)
    except BannedException:
        # Single back-off: sleep 40 minutes, then one retry.
        print('Waiting for chances...')
        time.sleep(60 * 40)
        soup = get_soup(url)
    # Result items carry a data-index attribute; the ASIN is in data-asin.
    asins = [div.attrs['data-asin']
             for div in soup.find_all('div')
             if 'data-index' in div.attrs]
    logging.info('Found {} items.'.format(len(asins)))
    for asin in asins:
        logging.info('product_id is {}.'.format(asin))
        fetched = get_comments_with_product_id(asin)
        logging.info('Fetched {} reviews.'.format(len(fetched)))
        persist_comment_to_disk(fetched)
def get_comments_with_product_id(product_id, count):
    """Fetch up to *count* reviews for *product_id* (10 reviews per page).

    Returns a list of dicts keyed p_id/title/rating/ReviewText/
    LastModificationTime; empty list for a missing or malformed id.
    """
    reviews = []
    if product_id is None:
        return reviews
    if not re.match('^[A-Z0-9]{10}$', product_id):
        return reviews
    # One extra page whenever count is not a multiple of 10.
    page_range = int(count / 10)
    if count % 10:
        page_range += 1
    review_index = 0
    for page_number in range(1, page_range + 1):
        so = get_soup(get_product_reviews_url(product_id, page_number))
        review_list_root = so.find(id='cm_cr-review_list')
        if review_list_root is None:
            logging.info('No reviews for this item.')
            break
        page_reviews = review_list_root.find_all(attrs={'class': 'a-section review'})
        if not page_reviews:
            logging.info('No more reviews to unstack.')
            break
        for review in page_reviews:
            # Star rating is encoded in the third CSS class, e.g. a-star-5.
            rating = review.find(attrs={'data-hook': 'review-star-rating'}).attrs['class'][2].split('-')[-1]
            body = review.find(attrs={'data-hook': 'review-body'}).text
            title = review.find(attrs={'data-hook': 'review-title'}).text
            date = review.find(attrs={'data-hook': 'review-date'}).text
            logging.info('***********************************************')
            logging.info('no.review = ' + str(review_index) + " /" + str(count))
            logging.info('TITLE = ' + title)
            logging.info('RATING = ' + rating)
            logging.info('CONTENT = ' + body)
            logging.info('DATE = ' + date)
            logging.info('***********************************************\n')
            reviews.append({
                'p_id': product_id,
                'title': title,
                'rating': rating,
                'ReviewText': body,
                'LastModificationTime': date
            })
            review_index += 1
    return reviews
def get_comments_with_product_id(product_id):
    """Scrape reviews for an ASIN, walking at most 100 review pages.

    Returns a list of dicts with keys title/rating/body/product_id;
    empty list for a missing or malformed product id.
    """
    reviews = []
    if product_id is None:
        return reviews
    # Amazon ASINs are exactly 10 uppercase alphanumerics.
    if not re.match('^[A-Z0-9]{10}$', product_id):
        return reviews
    for page_number in range(100):
        product_reviews_link = get_product_reviews_url(product_id, page_number)
        so = get_soup(product_reviews_link)
        cr_review_list_so = so.find(id='cm_cr-review_list')
        if cr_review_list_so is None:
            logging.info('No reviews for this item.')
            break
        reviews_list = cr_review_list_so.find_all(attrs={'class': 'a-section review'})
        if len(reviews_list) == 0:
            logging.info('No more reviews to unstack.')
            break
        for review in reviews_list:
            # Star rating is encoded in the third CSS class, e.g. a-star-5.
            rating = review.find(attrs={'data-hook': 'review-star-rating'}).attrs['class'][2].split('-')[-1]
            body = review.find(attrs={'data-hook': 'review-body'}).text
            title = review.find(attrs={'data-hook': 'review-title'}).text
            try:
                helpful = review.find(attrs={'data-hook': 'helpful-vote-statement'}).text
                helpful = helpful.strip().split(' ')[0]
            except AttributeError:
                # FIX: was a bare except.  Reviews with no votes have no
                # helpful-vote-statement tag, so .text raises AttributeError
                # on the None find() result; that is the only expected case.
                helpful = ''
            logging.info('***********************************************')
            logging.info('TITLE = ' + title)
            logging.info('RATING = ' + rating)
            logging.info('CONTENT = ' + body)
            logging.info('HELPFUL = ' + helpful)
            logging.info('***********************************************\n')
            reviews.append({
                'title': title,
                'rating': rating,
                'body': body,
                'product_id': product_id
            })
    return reviews
def extract_product_ids_from_link(category_link):
    """Collect product ids from every product-looking anchor on a page.

    /gp/product/ links are considered first, then /dp/ links; ids that
    fail to parse are dropped.
    """
    soup = get_soup(category_link)
    hrefs = [a.attrs['href'] for a in soup.find_all('a') if 'href' in a.attrs]
    # Same ordering as before: all /gp/product/ links, then all /dp/ links.
    candidate_links = ([h for h in hrefs if '/gp/product/' in h]
                       + [h for h in hrefs if '/dp/' in h])
    # extract_product_id returns None on failure; keep only real ids.
    return [pid for pid in map(extract_product_id, candidate_links)
            if pid is not None]
def get_comments_based_on_keyword(search):
    """Search amazon.co.jp for *search* and scrape reviews for each hit."""
    logging.info('SEARCH = {}'.format(search))
    url = 'http://www.amazon.co.jp/s/ref=nb_sb_noss_2?url=search-alias%3Daps&field-keywords=' + \
        search + '&rh=i%3Aaps%2Ck%3A' + search
    soup = get_soup(url)
    # Keep only result anchors that have both a title (h2) and a valid href.
    items = []
    for anchor in soup.find_all('a', class_='s-access-detail-page'):
        heading = anchor.find('h2')
        href = anchor.get('href')
        if heading is not None and validators.url(href):
            items.append((href, str(heading.string)))
    logging.info('Found {} items.'.format(len(items)))
    for link, name in items:
        logging.debug('link = {}, name = {}'.format(link, name))
        get_comments_with_product_id(extract_product_id(link))
def get_comments_with_product_id(product_id):
    """Scrape review bodies for an ASIN and mirror them to comments/<id>.txt.

    Returns the list of ASCII-sanitized review bodies; empty list for a
    missing or malformed product id.
    """
    reviews = []
    if product_id is None:
        return reviews
    # Amazon ASINs are exactly 10 uppercase alphanumerics.
    if not re.match('^[A-Z0-9]{10}$', product_id):
        return reviews
    # FIX: the original opened the file without ever closing it; the `with`
    # block guarantees the handle is closed even if a request/parse raises.
    with open('comments/' + product_id + '.txt', 'w') as f:
        for page_number in range(100):
            product_reviews_link = get_product_reviews_url(product_id, page_number)
            so = get_soup(product_reviews_link)
            cr_review_list_so = so.find(id='cm_cr-review_list')
            if cr_review_list_so is None:
                logging.info('No reviews for this item.')
                break
            reviews_list = cr_review_list_so.find_all(attrs={'class': 'a-section review'})
            if len(reviews_list) == 0:
                logging.info('No more reviews to unstack.')
                break
            # NOTE: rating/title were extracted but never used in the
            # original; only the body is written and returned.
            for review in reviews_list:
                body = review.find(attrs={'data-hook': 'review-body'}).text
                # Strip non-ASCII so the text file stays encoding-safe.
                body = body.encode('ascii', 'ignore').decode('ascii')
                f.write(body)
                f.write('\n')
                reviews.append(body)
    return reviews
def get_comments_with_product_id(product_id):
    """Scrape every review for an ASIN, paging off the total review count.

    Returns a list of dicts (title/rating/body/product_id/author_url/
    review_url/review_date); empty list for a bad id or when the page has
    no total-review-count element.
    """
    reviews = []
    if product_id is None:
        return reviews
    # Amazon ASINs are exactly 10 uppercase alphanumerics.
    if not re.match('^[A-Z0-9]{10}$', product_id):
        return reviews
    product_reviews_link = get_product_reviews_url(product_id)
    so = get_soup(product_reviews_link)
    max_page_number = so.find(attrs={'data-hook': 'total-review-count'})
    if max_page_number is None:
        return reviews
    # Keep only the digits (the count text may contain words/separators).
    max_page_number = ''.join([el for el in max_page_number.text if el.isdigit()])
    max_page_number = int(max_page_number) if max_page_number else 1
    max_page_number *= 0.1  # displaying 10 results per page. So if 663 results then ~66 pages.
    max_page_number = math.ceil(max_page_number)
    for page_number in range(1, max_page_number + 1):
        if page_number > 1:
            product_reviews_link = get_product_reviews_url(product_id, page_number)
            so = get_soup(product_reviews_link)
        cr_review_list_so = so.find(id='cm_cr-review_list')
        if cr_review_list_so is None:
            logging.info('No reviews for this item.')
            break
        reviews_list = cr_review_list_so.find_all('div', {'data-hook': 'review'})
        if len(reviews_list) == 0:
            logging.info('No more reviews to unstack.')
            break
        for review in reviews_list:
            # Star rating is encoded in the third CSS class, e.g. a-star-5.
            rating = review.find(attrs={'data-hook': 'review-star-rating'}).attrs['class'][2].split('-')[-1].strip()
            body = review.find(attrs={'data-hook': 'review-body'}).text.strip()
            title = review.find(attrs={'data-hook': 'review-title'}).text.strip()
            author_url = review.find(attrs={'data-hook': 'genome-widget'}).find('a', href=True)
            review_url = review.find(attrs={'data-hook': 'review-title'}).attrs['href']
            review_date = review.find(attrs={'data-hook': 'review-date'}).text.strip()
            if author_url:
                author_url = author_url['href'].strip()
            try:
                helpful = review.find(attrs={'data-hook': 'helpful-vote-statement'}).text.strip()
                helpful = helpful.strip().split(' ')[0]
            except AttributeError:
                # FIX: was a bare except.  Reviews with no votes have no
                # helpful-vote-statement tag; .text on None is the only
                # expected failure here.
                helpful = ''
            logging.info('***********************************************')
            logging.info('TITLE = ' + title)
            logging.info('RATING = ' + rating)
            logging.info('CONTENT = ' + '\n'.join(textwrap.wrap(body, 80)))
            logging.info('HELPFUL = ' + helpful)
            # FIX: parenthesized the conditionals.  Without parens each one
            # bound to the whole call argument, logging a bare '' whenever
            # the value was missing.
            logging.info('AUTHOR URL = ' + (author_url if author_url else ''))
            logging.info('REVIEW URL = ' + (review_url if review_url else ''))
            logging.info('REVIEW DATE = ' + (review_date if review_date else ''))
            logging.info('***********************************************\n')
            reviews.append({'title': title,
                            'rating': rating,
                            'body': body,
                            'product_id': product_id,
                            'author_url': author_url,
                            'review_url': review_url,
                            'review_date': review_date,
                            })
    return reviews
def get_comments_with_product_id(product_id):
    """Scrape every review for an ASIN, skipping captcha pages and dumping
    an intermediate JSON cache every CACHE_CHECK reviews.

    Returns a list of review dicts (title/rating/body/product_id/
    author_url/review_url/review_date); empty list for a bad id or when
    the page has no total-review-count element.
    """
    reviews = []
    if product_id is None:
        return reviews
    # Amazon ASINs are exactly 10 uppercase alphanumerics.
    if not re.match('^[A-Z0-9]{10}$', product_id):
        return reviews
    product_reviews_link = get_product_reviews_url(product_id)
    so = get_soup(product_reviews_link)
    max_page_number = so.find(attrs={'data-hook': 'total-review-count'})
    if max_page_number is None:
        return reviews
    # Keep only the digits (the count text may contain words/separators).
    max_page_number = ''.join([el for el in max_page_number.text if el.isdigit()])
    max_page_number = int(max_page_number) if max_page_number else 1
    max_page_number *= 0.1  # displaying 10 results per page. So if 663 results then ~66 pages.
    max_page_number = math.ceil(max_page_number)
    rev_count = 0
    for page_number in range(1, max_page_number + 1):
        if page_number > 1:
            product_reviews_link = get_product_reviews_url(product_id, page_number)
            so = get_soup(product_reviews_link)
        # Skip the page if the retry limit was reached and we got a captcha.
        if 'captcha' in str(so):
            continue
        cr_review_list_so = so.find(id='cm_cr-review_list')
        if cr_review_list_so is None:
            logging.info('No reviews for this item.')
            break
        reviews_list = cr_review_list_so.find_all('div', {'data-hook': 'review'})
        if len(reviews_list) == 0:
            logging.info('No more reviews to unstack.')
            break
        for review in reviews_list:
            try:
                rating = review.find(attrs={'data-hook': 'review-star-rating'}).attrs['class'][2].split('-')[-1].strip()
                body = review.find(attrs={'data-hook': 'review-body'}).text.strip()
                title = review.find(attrs={'data-hook': 'review-title'}).text.strip()
                author_url = review.find(attrs={'data-hook': 'genome-widget'}).find('a', href=True)
                review_url = review.find(attrs={'data-hook': 'review-title'}).attrs['href']
                review_date = review.find(attrs={'data-hook': 'review-date'}).text.strip()
            except Exception:
                logging.info('!!! Unpacking review failed!')
                logging.info('With review' + str(review))
                # FIX: skip this review.  The original fell through and then
                # used the fields anyway -- a NameError on the very first
                # review, or silently stale data from the previous one.
                continue
            if author_url:
                try:
                    tmp = author_url['href'].strip()
                except TypeError:
                    logging.info('!!! Unpacking author_url failed!')
                    logging.info('Using raw' + str(author_url))
                    tmp = author_url
                author_url = tmp
            try:
                helpful = review.find(attrs={'data-hook': 'helpful-vote-statement'}).text.strip()
                helpful = helpful.strip().split(' ')[0]
            except AttributeError:
                # FIX: was a bare except.  Reviews with no votes have no
                # helpful-vote-statement tag; .text on None is the only
                # expected failure here.
                helpful = ''
            logging.info('***********************************************')
            logging.info('TITLE = ' + title)
            logging.info('RATING = ' + rating)
            logging.info('CONTENT = ' + '\n'.join(textwrap.wrap(body, 80)))
            logging.info('HELPFUL = ' + helpful)
            # FIX: parenthesized the conditionals so the prefix is always
            # logged (without parens they swallowed the whole argument).
            logging.info('AUTHOR URL = ' + (author_url if author_url else ''))
            logging.info('REVIEW URL = ' + (review_url if review_url else ''))
            logging.info('REVIEW DATE = ' + (review_date if review_date else ''))
            logging.info('***********************************************\n')
            reviews.append({'title': title,
                            'rating': rating,
                            'body': body,
                            'product_id': product_id,
                            'author_url': author_url,
                            'review_url': review_url,
                            'review_date': review_date,
                            })
            rev_count += 1
            # Periodic checkpoint so a ban mid-scrape doesn't lose progress.
            if rev_count == CACHE_CHECK:
                with open(CACHE_FILE, 'w', encoding='utf-8') as fp:
                    json.dump(reviews, fp, sort_keys=True, indent=4, ensure_ascii=False)
                rev_count = 0
    return reviews
def get_random_product_ids(output_filename):
    """Randomly explore Amazon category pages, streaming unique product ids.

    The walk is stochastic by design (the link frontier is shuffled and
    popped), so a re-run is a fresh exploration rather than a resumed
    checkpoint.
    """
    logging.info('Writing to {}'.format(output_filename))
    with open(output_filename, 'w') as out:
        directory_soup = get_soup(AMAZON_BASE_URL + '/gp/site-directory/ref=nav_shopall_btn')
        # can have more by clicking on those buttons.
        category_links = [a.attrs['href']
                          for a in directory_soup.find_all('a', {'class': 'nav_a'})]
        all_product_ids = set()
        more_category_links = list(category_links)
        # Pass 1: widen the frontier with every /s/ search link we can find.
        for step, category_link in enumerate(category_links):
            try:
                logging.info('({}/{}) get as many links as we can.'.format(
                    step, len(category_links)))
                category_soup = get_soup(category_link)
                more_category_links.extend(
                    a.attrs['href'] for a in category_soup.find_all('a')
                    if 'href' in a.attrs and a.attrs['href'].startswith('/s/'))  # or /b/
                logging.info('{} links found so far.'.format(len(more_category_links)))
            except BannedException as be:
                raise be
            except Exception as e:
                logging.error('Exception occurred. Skipping')
                logging.error(e)
        random.shuffle(more_category_links)
        # Pass 2: drain the frontier, writing fresh product ids as we go.
        step = 0
        while more_category_links:
            step += 1
            logging.info('Stack length = {}'.format(len(more_category_links)))
            category_link = more_category_links.pop()
            try:
                logging.info('({}/{}) get as many products as we can.'.format(
                    step, len(more_category_links)))
                cur_product_ids = extract_product_ids_from_link(category_link)
                logging.info(cur_product_ids)
                for product_id in cur_product_ids:
                    if product_id in all_product_ids:
                        continue
                    all_product_ids.add(product_id)
                    out.write('{}\n'.format(product_id))
                    out.flush()
                logging.info('{} products found at this step.'.format(
                    len(cur_product_ids)))
                logging.info('{} unique products found so far.'.format(
                    len(all_product_ids)))
                # Fan out over paginated results, unless this link is
                # itself already a paginated one.
                if cur_product_ids and 'page' not in category_link:
                    more_category_links.extend(
                        category_link + '&page={}'.format(jj)
                        for jj in range(2, 50))
            except BannedException as be:
                raise be
            except Exception as e:
                logging.error('Exception occurred. Skipping')
                logging.error(e)
def get_comments_with_product_id(product_id, skip_num):
    """Scrape reviews for an ASIN starting at roughly page ceil(skip_num/10),
    persisting each review row to CSV as soon as it is parsed.

    Returns True once scraping ran to completion; False for a missing or
    malformed id, or when no total review count could be read.
    """
    if product_id is None:
        return False
    # Amazon ASINs are exactly 10 uppercase alphanumerics.
    if not re.match('^[A-Z0-9]{10}$', product_id):
        return False
    product_reviews_link = get_product_reviews_url(product_id)
    so = get_soup(product_reviews_link)
    product_title = so.find(attrs={'data-hook': 'product-link'})
    if product_title is None:
        product_title = 'unknown'
    else:
        product_title = product_title.text
    logging.info('product title: {}'.format(product_title))
    max_page_number = so.find(attrs={'data-hook': 'total-review-count'})
    if max_page_number is None:
        return False
    # Keep only the digits (the count text may contain words/separators).
    max_page_number = ''.join([el for el in max_page_number.text if el.isdigit()])
    max_page_number = int(max_page_number) if max_page_number else 1
    # Clamp the resume point inside the available range.
    skip_num = skip_num if skip_num < max_page_number else 1
    max_page_number *= 0.1  # displaying 10 results per page. So if 663 results then ~66 pages.
    skip_num *= 0.1
    max_page_number = math.ceil(max_page_number)
    min_page_number = math.ceil(skip_num)
    for page_number in range(min_page_number, max_page_number + 1):
        # Simple ASCII progress bar: stars for done, dots for remaining.
        logging.info('{:<10s} {:2.1f}% page {} of {}'.format(
            ('*' * math.floor(page_number / max_page_number * 10)).ljust(10, '.'),
            page_number / max_page_number * 100,
            page_number,
            max_page_number))
        if page_number > 1:
            product_reviews_link = get_product_reviews_url(product_id, page_number)
            so = get_soup(product_reviews_link)
        cr_review_list_so = so.find(id='cm_cr-review_list')
        if cr_review_list_so is None:
            logging.info('No reviews for this item.')
            break
        reviews_list = cr_review_list_so.find_all('div', {'data-hook': 'review'})
        if len(reviews_list) == 0:
            logging.info('No more reviews to unstack.')
            break
        for review in reviews_list:
            # Star rating is encoded in the third CSS class, e.g. a-star-5.
            rating = review.find(attrs={'data-hook': 'review-star-rating'}).attrs['class'][2].split('-')[-1].strip()
            body = review.find(attrs={'data-hook': 'review-body'}).text.strip()
            title = review.find(attrs={'data-hook': 'review-title'}).text.strip()
            author_url = review.find(attrs={'data-hook': 'genome-widget'}).find('a', href=True)
            review_url = '{}{}'.format(
                AMAZON_BASE_URL,
                review.find(attrs={'data-hook': 'review-title'}).attrs['href'])
            review_date = review.find(attrs={'data-hook': 'review-date'}).text.strip()
            if author_url:
                author_url = '{}{}'.format(AMAZON_BASE_URL, author_url['href'].strip())
            try:
                helpful = review.find(attrs={'data-hook': 'helpful-vote-statement'}).text.strip()
                helpful = helpful.strip().split(' ')[0]
            except AttributeError:
                # FIX: was a bare except.  Reviews with no votes have no
                # helpful-vote-statement tag; default to zero votes.
                helpful = '0'
            print(
                '{:<20s}'.format(review_date if review_date else '--/--/----') + \
                '\tRating:' + rating + \
                '\t ' + title)
            review_row = {
                'title': title,
                'rating': rating,
                'body': body,
                'helpful': helpful,
                'product_id': product_id,
                'author_url': author_url,
                'review_url': review_url,
                'review_date': review_date,
                'product_title': product_title
            }
            # Persist immediately so a ban mid-product keeps earlier rows.
            persist_comment_to_disk_in_csv(review_row)
    return True