def reviewmain():
    input_file_name = 'data/reviews.xlsx'
    output_file_name = 'data/reviews_out'
    input_sheet_name = 'reviews'
    output_sheet_name = 'reviews'

    # Initialize from given settings
    book_in = open_workbook(input_file_name)
    sheet_in = book_in.sheet_by_name(input_sheet_name)
    amzn = AmazonScraper(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_ASSOCIATE_TAG)
    book_out = Workbook()
    sheet_out = book_out.add_sheet(output_sheet_name)

    ids = sheet_in.col_values(0, 1)
    io = input('starting point?')
    i = io
    while i < len(ids):
        row = ids[i][:-1]
        print 'Item ', i + 1
        result = 0
        count = 0.0
        for j in row.split(','):
            r = amzn.review(Id=j)
            count += 1
            result += r.rating * 5
        add_data(sheet_out, i, [result / count])
        # Save after every item so progress survives a crash
        book_out.save(output_file_name + 'helpdec19.xls')
        i += 1

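# `add_data` (and `open_workbook`/`Workbook`, from xlrd/xlwt) are defined
# elsewhere in this project. A minimal sketch of what `add_data` might look
# like, assuming it writes one value per column into the given row -- a
# hypothetical helper, not the project's actual implementation:
def add_data(sheet, row_index, values):
    for col_index, value in enumerate(values):
        sheet.write(row_index, col_index, value)
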
def update_reviews(asin_list):
    # Load the AWS credentials once rather than once per ASIN
    with open(os.path.dirname(os.path.realpath(__file__)) + "/keys/aws_keys.json") as f:
        configs = json.loads(f.read())
    amzn = AmazonScraper(configs["aws_public_key"],
                         configs["aws_secret_key"],
                         configs["product_api_tag"])
    for asin in asin_list:
        try:
            p = amzn.lookup(ItemId=asin)
        except amazon.api.AsinNotFound:
            continue
        reviews = p.reviews()
        dates = queries.find_date_for_review(asin)
        media_type = queries.find_type_by_id(asin)
        unix_dates = [get_date(date) for date in dates]
        date = max(unix_dates)

        # Check whether the ASIN has reviews newer than the stored ones
        update = False
        for review in reviews:
            if date < int(review.date):
                print("needs updating")
                update = True

        # If the product has new reviews, fetch them all from Amazon
        list_of_review_dicts = []
        if update:
            product_api = aws_module.setup_product_api()
            all_reviews = list(reviews)
            for review in all_reviews:
                # Add each review's text and timestamp into a dictionary
                comment_dict = dict()
                comment_dict["text"] = url_scrape.parser(review.url)
                comment_dict["unixtime"] = int(review.date)
                list_of_review_dicts.append(comment_dict)
            return data_ingester.handleReview(asin, list_of_review_dicts,
                                              product_api, media_type)

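# `get_date` is referenced above but not shown. A plausible sketch that
# converts a date string to a unix timestamp, assuming an ISO-style
# "YYYY-MM-DD" input -- the real helper may expect a different format:
import calendar
import time

def get_date(date_string):
    return calendar.timegm(time.strptime(date_string, "%Y-%m-%d"))
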
def main():
    # User settings
    input_file_name = 'data/input.xlsx'
    output_file_name = 'data/output_data'
    input_sheet_name = 'product_list'
    output_sheet_name = 'processed_data'
    number_of_items = 100

    # Initialize from given settings
    book_in = open_workbook(input_file_name)
    sheet_in = book_in.sheet_by_name(input_sheet_name)

    # Get list of items from excel file
    ids = sheet_in.col_values(0, 1)
    product_types = sheet_in.col_values(1, 1)
    io = input('starting point?')
    i = io
    amzn = AmazonScraper(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_ASSOCIATE_TAG)
    book_out = Workbook()
    sheet_out = book_out.add_sheet(output_sheet_name)
    add_data_headers(sheet_out)
    p_count = 0

    # Iterate through items
    while i < len(ids):
        p = amzn.lookup(ItemId=ids[i])
        p_count += 1
        print 'Processing', p_count
        p_data = data(amzn, p, product_types[i])
        add_data(sheet_out, p_count, p_data)
        book_out.save(output_file_name + '_' + product_types[i] + '3.xls')
        i += 1

def initialize(prodId):
    amzn = AmazonScraper(acess_key, secret_key, customer_tag, Region='IN')
    p = amzn.lookup(ItemId=prodId)
    rs = amzn.reviews(ItemId=prodId)
    reviews, reviews_title = [], []
    i = 1
    for r in rs:
        fr = r.full_review()
        print_review(fr.title, fr.text, i)
        reviews.append(fr.text)
        reviews_title.append(fr.title)
        i += 1
    # Replace characters in the product title that are unsafe in file names
    prodName = p.title.replace('.', '-').replace('/', '-')
    return reviews, reviews_title, prodName

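# Example call with a placeholder product ID (hypothetical ASIN):
# reviews, reviews_title, prodName = initialize('B00EXAMPLE')
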
def setUpClass(cls):
    config = {}
    try:
        config['access_key'] = os.environ['AWS_ACCESS_KEY_ID']
        config['secret_key'] = os.environ['AWS_SECRET_ACCESS_KEY']
        config['associate_tag'] = os.environ['AWS_ASSOCIATE_TAG']
    except KeyError:
        raise AssertionError('''
            The following environment variables must be set:
                "AWS_ACCESS_KEY_ID"
                "AWS_SECRET_ACCESS_KEY"
                "AWS_ASSOCIATE_TAG"
        ''')
    cls.amzn = AmazonScraper(MaxQPS=0.5, **config)

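# The suite reads its credentials from the environment, so export the
# variables before running it, e.g. (shell; values are your own keys):
#   export AWS_ACCESS_KEY_ID=... AWS_SECRET_ACCESS_KEY=... AWS_ASSOCIATE_TAG=...
#   python -m unittest discover
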
def error_handler(err):
    # Back off for a random interval, then return True so bottlenose retries
    time.sleep(random.expovariate(0.1))
    return True

# Amazon API credentials.
AUTH_ARGS = [AMZ_ACCESS_KEY, AMZ_SECRET_KEY, AMZ_ASSOC_TAG]
# Amazon API request settings.
AUTH_KWARGS = {
    'Region': 'CN',
    'MaxQPS': 0.9,
    'Timeout': 5.0,
    'ErrorHandler': error_handler
}
amz_product = AmazonAPI(*AUTH_ARGS, **AUTH_KWARGS)
amz_scraper = AmazonScraper(*AUTH_ARGS, **AUTH_KWARGS)
amz_nose = bottlenose.Amazon(Parser=lambda text: BeautifulSoup(text, 'xml'),
                             *AUTH_ARGS, **AUTH_KWARGS)

class AmazonLookupItem(object):
    # Wrap all the useful APIs from AmazonAPI and add some new ones.
    def __init__(self, asin):
        amz = AmazonAPI(*AUTH_ARGS, **AUTH_KWARGS)
        print('\n>>> Parsing item %s from api...' % asin)
        self.item_api = amz.lookup(ItemId=asin)
        print('Done.\n')

    @property
    def is_prime(self):

def main(num_items, heading_level, args):
    """Main routine"""
    # Retrieve the contents of the API key file
    apikey = get_api_config(".amznrc")
    # Create AmazonScraper object using API key
    amznscpr = AmazonScraper(*apikey)
    # Check keyword list entered on the command line
    if len(args) < 1:
        print ("Missing search terms. For usage help: python amznsrch.py -h")
        sys.exit(1)
    # Loop through quoted lists of search terms from command line arguments
    for arg in args:
        # Print search terms as a markdown heading
        srch_terms = str(arg)
        if heading_level > 0 and heading_level < 7:
            print "\n" + "#" * heading_level + " " + srch_terms + "\n"
        # Fetch and return results
        for item in itertools.islice(amznscpr.search(Keywords=srch_terms,
                                                     SearchIndex="Books"),
                                     num_items):
            # Skip if no title, else encode, remove parenthetical text, & quote
            if not item.title:
                continue
            else:
                bktitle = item.title.encode("utf8")
                bktitle = re.sub(r"\s*[(\[].*[)\]]", "", bktitle)
                bktitlesrch = urllib.quote_plus('"' + bktitle + '"')
            # Encode author, if present, and format for printing
            if not item.author:
                bkauthor = ""
            else:
                bkauthor = "by " + item.author.encode("utf8")
            # Add associate tag to item URL
            bkurl = str(item.url) + "/?tag=" + apikey[2]
            # Construct links as desired
            amzn = "[AMZN](" + bkurl + ")"
            goog = ("[GOOG]" +
                    "(https://www.google.com/" +
                    "search?tbo=p&tbm=bks&q=intitle:" + bktitlesrch +
                    "&num=10&gws_rd=ssl)")
            spl = ("[SPL](https://seattle.bibliocommons.com/search?" +
                   "t=title&search_category=title&q=" + bktitlesrch +
                   "&commit=Search)")
            uwl = ("[UW](http://alliance-primo.hosted.exlibrisgroup.com/" +
                   "primo_library/libweb/action/search.do?fn=search&" +
                   "ct=search&vid=UW&vl%28753972432UI0%29=title&" +
                   "vl%281UIStartWith0%29=starts+with&vl%28freeText0%29=" +
                   bktitlesrch + "&Submit=Search)")
            # Searching UW Libraries through WorldCat to be deprecated 2015-09
            # uwl = ('[UW](http://uwashington.worldcat.org' +
            #        '/search?q=ti%3A' + bktitlesrch + '&qt=advanced)')
            # Print markdown for title, author, and links as bulleted list item
            print ("- _" + bktitle + "_ " + bkauthor + " ( " + goog +
                   " | " + amzn + " | " + spl + " | " + uwl + " )")

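# Presumably invoked from the command line with quoted search-term lists,
# e.g. (assumed invocation; option names live in the argument parsing
# elsewhere in this script):
#   python amznsrch.py "python web scraping" "data wrangling"
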
def main():
    validate_args()
    scrapper = AmazonScraper('products.yml')
    print('=============================== START ===============================')
    process_search(scrapper)
    print('=============================== END ===============================')

import requests, time, bottlenose, math, urllib, csv
from bs4 import BeautifulSoup
from amazon_scraper import AmazonScraper  # https://github.com/adamlwgriffiths/amazon_scraper
from decimal import Decimal
from app.categories import *
from app_config import *

# Amazon scraper + Amazon API wrapper
amazon = AmazonScraper(app.config['AMZ_API_KEY'],
                       app.config['AMZ_API_SECRET'],
                       app.config['AMZ_ASSOCIATE'])
# Access raw Amazon XML response
amazon_raw = bottlenose.Amazon(app.config['AMZ_API_KEY'],
                               app.config['AMZ_API_SECRET'],
                               app.config['AMZ_ASSOCIATE'])

def upc_to_asin(upc):
    # Throttle to stay under the Product Advertising API rate limit
    time.sleep(1)
    p = amazon.lookup(ItemId=upc, IdType='UPC', SearchIndex='All')
    # A UPC can map to several ASINs; lookup() returns a list in that case
    if not isinstance(p, list):
        asin = [p.asin]
    else:
        asin = [item.asin for item in p]
    return asin

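# Example use with a placeholder UPC (hypothetical 12-digit value); the
# return value is always a list, even when the UPC maps to a single ASIN:
# upc_to_asin('012345678905')  # -> ['B00EXAMPLE']
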
def test_single_asin(self):
    args = AmazonScraper.parse_args(['a'])
    assert args.asin

def setUp(self):
    args = {'asin': 'a'}
    self.amazon_scraper = AmazonScraper(**args)

def test_parse_asins_from_file(self):
    res = AmazonScraper.parse_asins_from_file(
        'amazon_scraper/tests/fixtures/test_parse_asins_from_file.txt')
    assert res == ['aaa', 'bbb']

def test_no_asins(self):
    with self.assertRaises(ValueError):
        AmazonScraper.parse_args([])

def test_asin_file(self):
    with self.assertRaises(ValueError):
        AmazonScraper.parse_args(['a', '--file', 'b'])

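# Taken together, these tests imply a CLI that accepts either positional
# ASINs or a --file of ASINs, but not both. Assumed invocations (the entry
# point name is a guess):
#   python scraper.py B00EXAMPLE
#   python scraper.py --file asins.txt
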
import requests
import re
import json
from textblob import TextBlob
import itertools
import pickle
from flask import Flask
from flaskext.mysql import MySQL

# Disable request warning
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

from amazon_scraper import AmazonScraper

# Substitute your own Product Advertising API credentials
amzn = AmazonScraper("AMAZON_ACCESS_KEY", "AMAZON_SECRET_KEY", "AMAZON_ASSOCIATE_TAG",
                     Region='IN', MaxQPS=0.9, Timeout=5.0)

app = Flask(__name__)
mysql = MySQL()
app.config['MYSQL_DATABASE_USER'] = '******'
app.config['MYSQL_DATABASE_PASSWORD'] = '******'
app.config['MYSQL_DATABASE_DB'] = 'review_data'
app.config['MYSQL_DATABASE_HOST'] = 'localhost'
mysql.init_app(app)
conn = mysql.connect()
cursor = conn.cursor()

asin_regex = r'/([A-Z0-9]{10})'
isbn_regex = r'/([0-9]{10})'

def get_amazon_item_id(url):
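    # The body is truncated in this excerpt. A plausible sketch of what it
    # might do with the regexes above (an assumption, not the original code):
    #     match = re.search(asin_regex, url) or re.search(isbn_regex, url)
    #     return match.group(1) if match else None
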
    return True

auth_args = [AMZ_ACCESS_KEY, AMZ_SECRET_KEY, AMZ_ASSOC_TAG]
auth_kwargs = {
    'Region': 'CN',
    'MaxQPS': 0.9,
    'Timeout': 5.0,
    'ErrorHandler': error_handler}
# region_options = bottlenose.api.SERVICE_DOMAINS.keys()
amz_product = AmazonAPI(*auth_args, **auth_kwargs)
amz_scraper = AmazonScraper(*auth_args, **auth_kwargs)
amz_nose = bottlenose.Amazon(
    Parser=lambda text: BeautifulSoup(text, 'xml'),
    *auth_args, **auth_kwargs)

def print_products(products):
    # product.features: List: product details
    with open('result.txt', 'w') as f:
        for i, product in enumerate(products):
            line = "{0}. '{1}'".format(i, product.title.encode('utf8'))
            print(line)
            f.write(line + '\n')

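# Example use, feeding search results into print_products. `search_n` is the
# amazon_scraper helper that returns the first n results; the keyword below
# is a placeholder:
# print_products(amz_scraper.search_n(10, Keywords='kindle'))
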
import csv
import requests
from amazon_scraper import AmazonScraper

resp = requests.get('http://icanhazip.com')
print "My current IP address:", resp.content.strip()

AUTH = requests.auth.HTTPProxyAuth('manutd0707', 'manutd0707')
PROXIES = {'http': 'http://us-dc.proxymesh.com:31280'}
resp = requests.get('http://icanhazip.com', proxies=PROXIES, auth=AUTH, verify=False)
print "My new IP address via ProxyMesh:", resp.content.strip()

AMAZON_ACCESS_KEY = "AMAZON_ACCESS_KEY"
AMAZON_SECRET_KEY = "AMAZON_SECRET_KEY"
AMAZON_ASSOCIATE_TAG = "AMAZON_ASSOCIATE_TAG"
amzn = AmazonScraper(AMAZON_ACCESS_KEY, AMAZON_SECRET_KEY, AMAZON_ASSOCIATE_TAG)
# You need 3 things for the above keys: an AWS account (first two codes above),
# an Amazon Associates account (final code), and then you need to sign up to use
# the Product Advertising API within the Associates account

filename = "reviews_allinfo.csv"
filename2 = "reviews_notext.csv"
save_path = 'c:/output/'

with open('product_ids.csv', 'rb') as f:
    csv_f = csv.reader(f)
    items = [row[0].strip() for row in csv_f]

for number in items: