Example #1
def reviewmain():
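    # For each row of comma-separated review IDs, fetch every review and write the average rating (scaled by 5).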
    input_file_name = 'data/reviews.xlsx'
    output_file_name = 'data/reviews_out'
    input_sheet_name = 'reviews'
    output_sheet_name = 'reviews'
    
    #Initialize from given settings
    book_in = open_workbook(input_file_name)
    sheet_in = book_in.sheet_by_name(input_sheet_name)
    
    amzn = AmazonScraper(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_ASSOCIATE_TAG)
    book_out = Workbook()
    sheet_out = book_out.add_sheet(output_sheet_name)
    
    ids = sheet_in.col_values(0,1)
    
    io = int(input('starting point?'))
    i = io

    while i < len(ids):
        row = ids[i][:-1]
        print('Item', i + 1)
        result = 0
        count = 0.0
        for j in row.split(','):
            r = amzn.review(Id=j)
            count +=1
            result += r.rating*5
        add_data(sheet_out, i, [result/count])
        book_out.save(output_file_name + 'helpdec19.xls')
        i += 1
Example #2
def update_reviews(asin_list):
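    # Re-ingest reviews for each ASIN whose newest Amazon review is more recent than the latest stored date.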
    # Load API credentials once, outside the loop, and close the file promptly
    with open(os.path.dirname(os.path.realpath(__file__)) + "/keys/aws_keys.json") as f:
        configs = json.loads(f.read())
    amzn = AmazonScraper(configs["aws_public_key"], configs["aws_secret_key"], configs["product_api_tag"])
    product_api = aws_module.setup_product_api()
    results = []
    for asin in asin_list:
        try:
            p = amzn.lookup(ItemId=asin)
        except amazon.api.AsinNotFound:
            continue
        reviews = list(p.reviews())
        dates = queries.find_date_for_review(asin)
        media_type = queries.find_type_by_id(asin)
        unix_dates = []
        for date in dates:
            unix_dates.append(get_date(date))
        date = max(unix_dates)
        update = False
        for review in reviews:
            if date < int(review.date):  # check if asin needs updating
                print("needs updating")
                update = True
                break
        list_of_review_dicts = []
        # if the product has new reviews get them from amazon
        if update:
            for review in reviews:  # add each review's values into a dictionary
                comment_dict = dict()
                comment_dict["text"] = url_scrape.parser(review.url)
                comment_dict["unixtime"] = int(review.date)
                list_of_review_dicts.append(comment_dict)
        results.append(data_ingester.handleReview(asin, list_of_review_dicts, product_api, media_type))
    return results
Example #3
def main():
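    # Look up each product ID from the input sheet and write one row of processed data per item, saving as it goes.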
    #user settings
    input_file_name = 'data/input.xlsx'
    output_file_name = 'data/output_data'
    input_sheet_name = 'product_list'
    output_sheet_name = 'processed_data'
    
    number_of_items = 100

    #Initialize from given settings
    book_in = open_workbook(input_file_name)
    sheet_in = book_in.sheet_by_name(input_sheet_name)
    
    #Get list of items from excel file
    ids = sheet_in.col_values(0,1)
    product_types = sheet_in.col_values(1,1)
    
    io = int(input('starting point?'))
    i = io
    amzn = AmazonScraper(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_ASSOCIATE_TAG)
    book_out = Workbook()
    sheet_out = book_out.add_sheet(output_sheet_name)
    add_data_headers(sheet_out)
    p_count = 0
    
    # iterate through items
    while i < len(ids):
        p = amzn.lookup(ItemId=ids[i])
        p_count += 1
        print('Processing', p_count)
        p_data = data(amzn, p, product_types[i])
        add_data(sheet_out, p_count, p_data)
        book_out.save(output_file_name + '_' + product_types[i] + '3.xls')
        i += 1
Example #4
def initialize(prodId):
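    # Fetch the product and its reviews, print each full review, and sanitize the title for use in a filename.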

    amzn = AmazonScraper(acess_key, secret_key, customer_tag, Region='IN')
    p = amzn.lookup(ItemId=prodId)
    rs = amzn.reviews(ItemId=prodId)
    reviews, reviews_title = [], []
    for i, r in enumerate(rs, start=1):
        fr = r.full_review()
        print_review(fr.title, fr.text, i)
        reviews.append(fr.text)
        reviews_title.append(fr.title)
    prodName = p.title.replace('.', '-').replace('/', '-')
    return reviews, reviews_title, prodName
Example #5
def setUpClass(cls):
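    # Pull AWS credentials from the environment and share one rate-limited scraper across the test class.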
    config = {}
    try:
        config['access_key'] = os.environ['AWS_ACCESS_KEY_ID']
        config['secret_key'] = os.environ['AWS_SECRET_ACCESS_KEY']
        config['associate_tag'] = os.environ['AWS_ASSOCIATE_TAG']
    except KeyError:
        raise AssertionError('''
            The following environment variables must be set:
                    "AWS_ACCESS_KEY_ID"
                    "AWS_SECRET_ACCESS_KEY"
                    "AWS_ASSOCIATE_TAG"
        ''')
    cls.amzn = AmazonScraper(MaxQPS=0.5, **config)
Example #6
def error_handler(err):
    # Handler signature assumed: bottlenose calls ErrorHandler with an error
    # dict and retries the request when the handler returns True.
    time.sleep(random.expovariate(0.1))
    return True


# Amazon API credentials.
AUTH_ARGS = [AMZ_ACCESS_KEY, AMZ_SECRET_KEY, AMZ_ASSOC_TAG]
# Amazon API request settings.
AUTH_KWARGS = {
    'Region': 'CN',
    'MaxQPS': 0.9,
    'Timeout': 5.0,
    'ErrorHandler': error_handler
}

amz_product = AmazonAPI(*AUTH_ARGS, **AUTH_KWARGS)
amz_scraper = AmazonScraper(*AUTH_ARGS, **AUTH_KWARGS)
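# Raw bottlenose client; responses are parsed as XML with BeautifulSoup.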
amz_nose = bottlenose.Amazon(Parser=lambda text: BeautifulSoup(text, 'xml'),
                             *AUTH_ARGS,
                             **AUTH_KWARGS)


class AmazonLookupItem(object):
    # Wrap the useful parts of the AmazonAPI interface and add some new helpers.
    def __init__(self, asin):
        amz = AmazonAPI(*AUTH_ARGS, **AUTH_KWARGS)
        print('\n>>> Parsing item %s from api...' % asin)
        self.item_api = amz.lookup(ItemId=asin)
        print('Done.\n')

    @property
    def is_prime(self):
Example #7
def main(num_items, heading_level, args):
    """Main routine"""

    # Retrieve the contents of the API key file
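    # (get_api_config is assumed to return access key, secret key, and associate tag; apikey[2] supplies the link tag below)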
    apikey = get_api_config(".amznrc")

    # Create AmazonScraper object using API key
    amznscpr = AmazonScraper(*apikey)

    # Check keyword list entered on the command line
    if len(args) < 1:
        print ("Missing search terms. For usage help: python amznsrch.py -h")
        sys.exit(1)

    # Loop through quoted lists of search terms from command line arguments
    for arg in args:

        # Print search terms as a markdown heading
        srch_terms = str(arg)
        if heading_level > 0 and heading_level < 7:
            print "\n" + "#" * heading_level + " " + srch_terms + "\n"

        # Fetch and return results
        for item in itertools.islice(amznscpr.search(Keywords=srch_terms, SearchIndex="Books"), num_items):

            # Skip if no title, else encode, remove parenthetical text, & quote
            if not item.title:
                continue
            else:
                bktitle = item.title.encode("utf8")
                bktitle = re.sub(r"\s*[(\[].*[)\]]", "", bktitle)
                bktitlesrch = urllib.quote_plus('"' + bktitle + '"')

            # Encode author, if present, and format for printing
            if not item.author:
                bkauthor = ""
            else:
                bkauthor = "by " + item.author.encode("utf8")

            # Add associate tag to item URL
            bkurl = str(item.url) + "/?tag=" + apikey[2]

            # Construct links as desired
            amzn = "[AMZN](" + bkurl + ")"
            goog = (
                "[GOOG]"
                + "(https://www.google.com/"
                + "search?tbo=p&tbm=bks&q=intitle:"
                + bktitlesrch
                + "&num=10&gws_rd=ssl)"
            )
            spl = (
                "[SPL](https://seattle.bibliocommons.com/search?"
                + "t=title&search_category=title&q="
                + bktitlesrch
                + "&commit=Search)"
            )
            uwl = (
                "[UW](http://alliance-primo.hosted.exlibrisgroup.com/"
                + "primo_library/libweb/action/search.do?fn=search&"
                + "ct=search&vid=UW&vl%28753972432UI0%29=title&"
                + "vl%281UIStartWith0%29=starts+with&vl%28freeText0%29="
                + bktitlesrch
                + "&Submit=Search)"
            )
            # Searching UW Libraries through WorldCat to be deprecated 2015-09
            # uwl = ('[UW](http://uwashington.worldcat.org' +
            #       '/search?q=ti%3A' + bktitlesrch + '&qt=advanced)')

            # Print markdown for title, author, and links as bulleted list item
            print ("- _" + bktitle + "_ " + bkauthor + " ( " + goog + " | " + amzn + " | " + spl + " | " + uwl + " )")
Example #8
def main():
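    # Validate the command-line arguments, then run the search defined in products.yml.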
    validate_args()
    scraper = AmazonScraper('products.yml')
    print('===============================START===============================')
    process_search(scraper)
    print('===============================END===============================')
Example #9
import requests, time, bottlenose, math, urllib, csv
from bs4 import BeautifulSoup
from amazon_scraper import AmazonScraper  # https://github.com/adamlwgriffiths/amazon_scraper
from decimal import Decimal
from app.categories import *
from app_config import *

# Amazon scraper + Amazon API wrapper
amazon = AmazonScraper(app.config['AMZ_API_KEY'], app.config['AMZ_API_SECRET'],
                       app.config['AMZ_ASSOCIATE'])

# Access Raw Amazon XML Response
amazon_raw = bottlenose.Amazon(app.config['AMZ_API_KEY'],
                               app.config['AMZ_API_SECRET'],
                               app.config['AMZ_ASSOCIATE'])


def upc_to_asin(upc):
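    # Throttle to ~1 request/second; lookup() may return one product or a list, so normalize to a list of ASINs.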
    time.sleep(1)
    p = amazon.lookup(ItemId=upc, IdType='UPC', SearchIndex='All')
    if not isinstance(p, list):
        asin = [p.asin]
    else:
        asin = [item.asin for item in p]
    return asin

Example #10
def test_single_asin(self):
    args = AmazonScraper.parse_args(['a'])
    assert args.asin
Example #11
    def setUp(self):

        args = {'asin': 'a'}

        self.amazon_scraper = AmazonScraper(**args)
Example #12
def test_parse_asins_from_file(self):
    res = AmazonScraper.parse_asins_from_file(
        'amazon_scraper/tests/fixtures/test_parse_asins_from_file.txt')
    assert res == ['aaa', 'bbb']
Example #13
def test_no_asins(self):
    with self.assertRaises(ValueError):
        AmazonScraper.parse_args([])
Example #14
def test_asin_file(self):
    with self.assertRaises(ValueError):
        AmazonScraper.parse_args(['a', '--file', 'b'])
Example #15
import requests
import re
import json
from textblob import TextBlob
import itertools

import pickle

# Disable insecure request warning
from requests.packages.urllib3.exceptions import InsecureRequestWarning

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

from amazon_scraper import AmazonScraper
from flask import Flask  # imports inferred from usage below
from flaskext.mysql import MySQL

amzn = AmazonScraper("AKIAJ5G4TDSHO2D54APQ", "DkMW4edxLB91MGcnDhChkciqj2XumqlySi9yOhT6", "beproject0d-20", Region='IN',
                     MaxQPS=0.9, Timeout=5.0)

app = Flask(__name__)
mysql = MySQL()
app.config['MYSQL_DATABASE_USER'] = '******'
app.config['MYSQL_DATABASE_PASSWORD'] = '******'
app.config['MYSQL_DATABASE_DB'] = 'review_data'
app.config['MYSQL_DATABASE_HOST'] = 'localhost'
mysql.init_app(app)
conn = mysql.connect()
cursor = conn.cursor()
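# A 10-character ASIN or a 10-digit ISBN embedded in an Amazon URL path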
asin_regex = r'/([A-Z0-9]{10})'
isbn_regex = r'/([0-9]{10})'


def get_amazon_item_id(url):
Example #16
def error_handler(err):
    # Handler signature assumed: bottlenose retries the request when this returns True.
    return True


auth_args = [AMZ_ACCESS_KEY, AMZ_SECRET_KEY, AMZ_ASSOC_TAG]
auth_kwargs = {
    'Region': 'CN',
    'MaxQPS': 0.9,
    'Timeout': 5.0,
    'ErrorHandler': error_handler}


# region_options = bottlenose.api.SERVICE_DOMAINS.keys()

amz_product = AmazonAPI(*auth_args, **auth_kwargs)

amz_scraper = AmazonScraper(*auth_args, **auth_kwargs)

amz_nose = bottlenose.Amazon(
    Parser=lambda text: BeautifulSoup(text, 'xml'),
    *auth_args,
    **auth_kwargs)


def print_products(products):
    # product.features: list of product detail strings

    with open('result.txt', 'w', encoding='utf8') as f:
        for i, product in enumerate(products):
            line = "{0}. '{1}'".format(i, product.title)
            print(line)
            f.write(line + '\n')
Example #17
resp = requests.get('http://icanhazip.com')
print "My current IP address:", resp.content.strip()

AUTH = requests.auth.HTTPProxyAuth('PROXY_USERNAME', 'PROXY_PASSWORD')  # credentials redacted
PROXIES = {'http': 'http://us-dc.proxymesh.com:31280'}
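# Repeat the IP check through the authenticated proxy to confirm the new egress address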
resp = requests.get('http://icanhazip.com',
                    proxies=PROXIES,
                    auth=AUTH,
                    verify=False)
print "My new IP address via ProxyMesh:", resp.content.strip()

AMAZON_ACCESS_KEY = "AMAZON_ACCESS_KEY"
AMAZON_SECRET_KEY = "AMAZON_SECRET_KEY"
AMAZON_ASSOCIATE_TAG = "AMAZON_ASSOCIATE_TAG"

amzn = AmazonScraper(AMAZON_ACCESS_KEY, AMAZON_SECRET_KEY,
                     AMAZON_ASSOCIATE_TAG)
# You need 3 things for the above keys: AWS account (first two codes above),
# Amazon Associates account (final code), and then you need to sign up to use
# the Product Advertising API within the Associates account

filename = "reviews_allinfo.csv"
filename2 = "reviews_notext.csv"

save_path = 'c:/output/'

with open('product_ids.csv', 'r', newline='') as f:
    csv_f = csv.reader(f)
    items = [row[0].strip() for row in csv_f]

for number in items: