def test_amazonscraper_csv_header():
    """The CSV export must contain the expected header row."""
    products = amazonscraper.search(keywords="Python", max_product_nb=1)
    expected_header = (
        "Product title,Rating,Number of customer reviews,Product URL,ASIN\n"
    )
    assert expected_header in str(products.csv())
def main():
    """Search Amazon for a product, keep the top-rated results and dump
    them (together with their reviews) into ``result.txt``.

    Usage: ``script.py <product-name> <number-of-items>``
    """
    product = sys.argv[1]       # product name
    num = int(sys.argv[2])      # how many items you want
    results = amazonscraper.search(product, max_product_nb=num)

    ranked = sorted(results, key=lambda item: item.rating, reverse=True)
    # fix: was lst[0:9], which kept only 9 items despite the name
    top_ten = ranked[:10]

    # fix: the file was re-opened with 'w+' inside the loop, truncating it on
    # every iteration so only the last product's data survived.  Open once.
    with open("result.txt", "w") as file:
        for result in top_ten:
            file.write("{}\n".format(result.title))
            file.write(" - ASIN : {}\n".format(result.asin))
            file.write(" - {} out of 5 stars, {} customer reviews\n".format(
                result.rating, result.review_nb))
            file.write("- Reviews:\n")
            # getReviews is a project helper; it returns parallel lists of
            # review headers and review bodies for the product page.
            data = getReviews(result.url)
            headerlists = data['headerlists']
            reviewlists = data['reviewlists']
            for i in range(len(headerlists)):
                print("hey:" + headerlists[i])
                file.write("Review " + str(i) + "{}".format(headerlists[i]))
                file.write("\n + {}\n\n\n".format(reviewlists[i]))
def test_amazonscraper_not_satisfied_url():
    """A results page with no satisfying products yields an empty list."""
    page_url = ("https://raw.githack.com/tducret/"
                "amazon-scraper-python/master/test/not_satisfied.html")
    products = amazonscraper.search(search_url=page_url,
                                    max_product_nb=_MAX_PRODUCT_NB)
    assert len(products) == 0
def test_amazonscraper_get_100_products():
    """Requesting 100 products must return exactly 100 of them."""
    found = amazonscraper.search(keywords="Python", max_product_nb=100)
    assert len(found) == 100
def test_amazonscraper_get_products_with_keywords():
    """A keyword search honours the requested product count."""
    found = amazonscraper.search(keywords="Python",
                                 max_product_nb=_MAX_PRODUCT_NB)
    assert len(found) == _MAX_PRODUCT_NB
def main(keywords, url, csvseparator, maxproductnb, outputhtml):
    """ Search for products on Amazon, and extract it as CSV """
    products = amazonscraper.search(
        keywords=keywords,
        search_url=url,
        max_product_nb=maxproductnb,
    )
    print(products.csv(separator=csvseparator))
    # Optionally keep the last scraped HTML page (useful for debugging).
    if outputhtml != "":
        with open(outputhtml, "w") as html_file:
            html_file.write(products.last_html_page)
def test_amazonscraper_get_products_with_url():
    """Searching by an explicit URL returns well-formed Product objects."""
    search_url = ("https://www.amazon.com/s/ref=nb_sb_noss"
                  "?url=search-alias%3Daps&field-keywords=python")
    products = amazonscraper.search(search_url=search_url,
                                    max_product_nb=_MAX_PRODUCT_NB)
    assert isinstance(products, amazonscraper.Products)
    assert len(products) == _MAX_PRODUCT_NB
    first = products[0]
    assert isinstance(first, amazonscraper.Product)
    # Every scraped field must be populated.
    for field in (first.title, first.review_nb, first.rating, first.url):
        assert field != ""
def retrieveContent(keyword):
    """Search Amazon for *keyword* and print each hit, then a summary line."""
    hits = amazonscraper.search(keyword)
    for hit in hits:
        print("{}".format(hit.title))
        print(" - ASIN : {}".format(hit.asin))
        print(" - {} out of 5 stars, {} customer reviews".format(hit.rating, hit.review_nb))
        print(" - {}".format(hit.url))
        print(" - Image : {}".format(hit.img))
        print()
    print("Number of results : %d" % (len(hits)))
def posting():
    """Flask POST handler: scrape Amazon for the submitted product, pipe the
    results into the DELL database, then re-render the page with the ten
    most recent rows.
    """
    if request.method == 'POST':
        component = str(request.form['component'])
        product = str(request.form['product'])
        search = int(request.form['max'])
        print(component)
        print(product)
        print(search)

        # fix: `results` was unbound when search() raised (the except clause
        # only printed), so the `results is None` check below crashed with a
        # NameError instead of taking the error path.
        results = None
        try:
            results = amazonscraper.search(product, max_product_nb=search)
        except Exception as e:
            print(e)
        if results is None:
            print("over")
            # NOTE(review): an int is not a valid Flask view return value —
            # this probably needs to render an error page; confirm intent.
            return 0

        for result in results:
            # Fall back to a placeholder ASIN when Amazon did not supply one.
            code_no = result.asin if result.asin is not None else '123456'
            supply_rate1 = result.rating
            component_rate1 = ((result.rating) / (result.rating + 1)) * supply_rate1
            price1 = result.prices_main
            supply1 = get_title(result.url)
            print(supply_rate1, component_rate1, price1, supply1, component, code_no)
            pipe_to_db(component, supply1, code_no, price1,
                       supply_rate1, component_rate1)

        db = pymysql.connect("localhost", "root", "password", "DELL")
        try:
            cursor = db.cursor()
            cursor.execute('SELECT * FROM DELLDB ORDER BY SNO DESC LIMIT 10')
            nested_data = cursor.fetchall()
        finally:
            # fix: the connection leaked if the query raised.
            db.close()
        return render_template('index2.html', nested_data=nested_data)
def get_rss_amazon(key_words):
    # Build an RSS feed of Amazon search results for every keyword in
    # *key_words*, write it to disk, upload it to Google Drive, and store a
    # pretty-printed copy under templates/.
    #
    # key_words: iterable of search strings; one Amazon search per entry.
    store = file.Storage('token.json')
    creds = store.get()
    if not creds or creds.invalid:
        # No valid cached OAuth token: run the interactive flow and cache it.
        flow = client.flow_from_clientsecrets('credentials.json', SCOPES)
        creds = tools.run_flow(flow, store)
    service = build('drive', 'v3', http=creds.authorize(Http()))
    # Parallel lists, one entry per scraped product across all keywords.
    title_list = []
    list_rating = []
    list_review = []
    list_url = []
    for i in key_words:
        results = list(amazonscraper.search(i))
        print(i)
        print(results)
        for result in results:
            title_list.append(result.title)
            list_rating.append(result.rating)
            list_review.append(result.review_nb)
            list_url.append(result.url)
    # One tuple per product: (title, rating, review_nb, url).
    result = zip(title_list, list_rating, list_review, list_url)
    feed = feedgenerator.Rss201rev2Feed(title="all events",
                                        link="https://www.amazon.com/",
                                        description="New in amazon",
                                        language="en")
    for info in result:
        # NOTE(review): the item description is the product *rating*, not the
        # review text — confirm this is intentional.
        feed.add_item(title=info[0], link=info[3],
                      description=info[1], unique_id='no')
    with open('rss_by_keywords_amazon.rss', 'w') as fp:
        feed.write(fp, 'utf-8')
    # Upload the generated feed file to Google Drive.
    file_metadata = {'name': 'rss_by_keywords_amazon.rss'}
    media = MediaFileUpload('rss_by_keywords_amazon.rss',
                            mimetype='text/plain', resumable=True)
    fili = service.files().create(body=file_metadata, media_body=media,
                                  fields='id').execute()
    # Pretty-print the XML and keep a copy that the web app can serve.
    o = feed.writeString('utf-8')
    soup = BeautifulSoup(o, "xml")
    soup = soup.prettify()
    with open('templates/rss_by_keywords_amazon.rss', 'w') as fp:
        fp.write(str(soup))
def main(keywords, url, csvseparator, maxproductnb, outputhtml):
    """ Search for products on Amazon, and extract it as CSV """
    products = amazonscraper.search(
        keywords=keywords,
        search_url=url,
        max_product_nb=maxproductnb,
    )
    print(products.csv("estrazione.csv", separator=csvseparator))
    # Optionally keep the last scraped HTML page (useful for debugging).
    if outputhtml != "":
        with open(outputhtml, "w") as html_file:
            html_file.write(products.last_html_page)
def test_amazonscraper_sign_in_suggestion_url():
    """A sign-in suggestion page contains no products."""
    # or https://www.amazon.com/ref=assoc_res_sw_logo
    page_url = "https://www.amazon.com/gp/aw/ref=mw_access"
    products = amazonscraper.search(search_url=page_url,
                                    max_product_nb=_MAX_PRODUCT_NB)
    assert len(products) == 0
def test_amazonscraper_invalid_url():
    """An unreachable host must raise instead of returning results."""
    bad_url = "https://0.0.0.0"
    with pytest.raises(Exception):
        amazonscraper.search(search_url=bad_url,
                             max_product_nb=_MAX_PRODUCT_NB)
def test_amazonscraper_csv_header():
    """The CSV written to disk carries the expected column names."""
    products = amazonscraper.search(keywords="Python", max_product_nb=1)
    products.csv('test.csv')
    with open('test.csv') as f:
        contents = f.read()
    header = "title,rating,review_nb,img,url,asin,prices_per_unit,units,prices_main"
    assert header in contents
import amazonscraper

results = amazonscraper.search("", max_product_nb=2)


def get_title(url):
    """Return the first path segment of *url* with dashes turned into
    spaces, e.g. ``https://host/Some-Product/dp/X`` -> ``"Some Product"``.

    Assumes the URL contains at least three ``/`` (scheme + host); otherwise
    the original code raised a NameError, which is preserved here.
    """
    ctr = 0
    for i in range(len(url)):
        if url[i] == "/":
            ctr += 1
        if ctr == 3:
            url_new = url[i + 1:]
            break
    for j in range(len(url_new)):
        if url_new[j] == "/":
            url_new = url_new[:j]
            break
    # fix: str.replace returns a new string; the original discarded the
    # result, so dashes were never actually replaced.
    url_new = url_new.replace("-", " ")
    return url_new


with open("test.html", "w") as f:
    f.write(results.last_html_page)

for result in results:
    print("{}".format(result.title))
    print(" - ASIN : {}".format(result.asin))
    print(" - {} out of 5 stars, {} customer reviews".format(
        result.rating, result.review_nb))
    print(" link - {}".format(result.url))
    print(" - Image : {}".format(result.img))
    print(" - prices : {}".format(result.prices_main))
    print(" - title : {}".format(result.title))
import amazonscraper

# Demo script: fetch two results for a fixed query and print their details.
results = amazonscraper.search("Python programming", max_product_nb=2)

for item in results:
    print("{}".format(item.title))
    print(" - ASIN : {}".format(item.asin))
    print(" - {} out of 5 stars, {} customer reviews".format(
        item.rating, item.review_nb))
    print(" - {}".format(item.url))
    print(" - Image : {}".format(item.img))
    print()

print("Number of results : %d" % (len(results)))
def main(category_list):
    # Scrape Amazon for every category in *category_list*, writing one JSON
    # file per product (plus its image) into a per-category output directory.
    # Returns the output directory of the last category processed.
    #
    # category_list: list of dicts with keys "category_name" and "id".
    #
    # NOTE(review): this block was recovered from whitespace-mangled source;
    # the nesting of the result loop and the final return was inferred from
    # the data flow (category_id/results are per-category) — confirm against
    # the original file.
    logger = logging.getLogger(__name__)
    time = str(strftime("%Y-%m-%d %H:%M:%S", localtime()))
    # Hard-coded, machine-specific output root.
    output_dir = "C:/Users/Tan Ye Kai/Documents/Uni work/Y1S2/HackNroll/Scraper/output"
    # category_list_path = "C:/Users/Tan Ye Kai/Documents/Uni work/Y1S2/HackNroll/Scraper/data/categories.csv"
    # category_list = global_func.read_csv(category_list_path)
    # category_list = ['Sweatpants']
    for dic in category_list:
        new_output_dir = global_func.make_directory(output_dir, "files")
        image_dir = global_func.make_directory(output_dir, "images")
        category = dic["category_name"]
        category_id = dic["id"]
        new_output_dir = global_func.make_directory(new_output_dir, category)
        # NOTE(review): this check is always true here — we are already
        # iterating a non-empty category_list.
        if (len(category_list) > 0):
            print(time + ": Number of categories in list: {}".format(
                len(category_list)))
        else:
            logger.error("Category list not extracted")
        print(time + ": Scraping.....")
        results = amazonscraper.search(category, max_product_nb=50)
        # Placeholder description applied to every scraped product.
        lorem = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."
        for result in results:
            # print(result)
            # Build the JSON payload for this product.
            json_object = json_format.initialise_job_roles()
            json_object["name"] = result.title
            json_object["description"] = lorem
            json_object["category_id"] = category_id
            json_object["provider"] = "Amazon"
            price = cleanup_price(result.price)
            json_object["price"] = price
            image = cleanup_img(result.img)
            json_object["image"] = image
            # print(result.url)
            json_object["url"] = result.url
            if (result.rating):
                rating = convert_to_percentage(float(result.rating))
                json_object["rating"] = rating
            # print("{}".format(result.title))
            # print(" - ASIN : {}".format(result.asin))
            # print(" - {} out of 5 stars, {} customer reviews".format(result.rating, result.review_nb))
            # print(" - {}".format(result.url))
            # print(" - Image : {}".format(result.img))
            # print(" - Price : {}".format(result.price))
            # print()
            # NOTE(review): truncates to 40 chars only when longer than 50 —
            # the mismatched limits look unintentional; confirm.
            title = (result.title[:40] + '') if len(result.title) > 50 else result.title
            title = title.replace(" ", "_")
            # Rebinds `title` to a Match object (or None) — only purely
            # alphanumeric/underscore titles are kept.
            title = re.match("^[a-zA-Z0-9_]*$", title)
            if (title):
                filename, file_extension = os.path.splitext(image)
                img_path = global_func.make_filepath(image_dir, filename,
                                                     file_extension)
                # Only write the JSON file when the image download succeeded.
                if (global_func.download_img(result.img, img_path)):
                    title = title.group(0)
                    file_path = global_func.make_filepath(
                        new_output_dir, title, ".json")
                    global_func.make_json(file_path, json_object)
            else:
                pass
    return new_output_dir
import amazonscraper

# Demo script: search for "corsair" and print every result's details.
results = amazonscraper.search("corsair")

for entry in results:
    print("{}".format(entry.title))
    print(" - ASIN : {}".format(entry.asin))
    print(" - {} out of 5 stars, {} customer reviews".format(
        entry.rating, entry.review_nb))
    print(" - {}".format(entry.url))
    print(" - Image : {}".format(entry.img))
    print()

print("Number of results : %d" % (len(results)))