def scrape_fossils(key):
    """Scrape stand-alone and multi-part fossils and dump them as JSON.

    Args:
        key: Lookup key into URLS giving the fossil wiki page URL.

    Returns:
        Dict with "stand_alone" and "multi_part" sub-dicts keyed by fossil name.
    """
    url = URLS.get(key)
    response = requests.get(url, timeout=5)
    soup = BeautifulSoup(response.content, "html.parser")
    table = soup("table", {"class": "sortable"})
    items = {}

    # Stand-alone fossils (first sortable table); skip the header row.
    items["stand_alone"] = {}
    for tr in table[0]("tr")[1:]:
        name = tr("td")[0].a.text
        item = {
            "image_url": tr("a")[1]['href'],
            "price": parse_price(tr("td")[2].text),
        }
        items["stand_alone"][name] = item

    # Multi-part fossils (second table). Rows without <td> cells are
    # category headers; remember the category for the item rows that follow.
    items["multi_part"] = {}
    # Bug fix: initialize the category so a malformed table (item row before
    # any category row) yields category=None instead of raising NameError.
    current_category = None
    for tr in table[1]("tr")[1:]:
        tds = tr("td")
        if not tds:
            current_category = tr("a")[0].text
            continue
        name = tds[0].a.text
        item = {
            "image_url": tr("a")[1]['href'],
            "price": parse_price(tds[2].text),
            "category": current_category,
        }
        items["multi_part"][name] = item

    dump_data(items, "museum/" + key)
    return items
def scrape_furniture_housewares(key):
    """Scrape the per-letter housewares furniture tables and dump them as JSON."""
    url = URLS["furniture"][key]
    response = requests.get(url, timeout=5)
    # html5lib is required here: html.parser does not scrape all html contents.
    soup = BeautifulSoup(response.content, "html5lib")
    tables = soup("table", {"class": "roundy"})

    items = {}
    for index in range(3, 29):  # one table per letter, a - z
        rows = tables[index]("tr")
        if len(rows) <= 3:  # some tables are empty
            continue
        for row in rows[2:]:
            cells = row("td")
            name = cells[1].text.strip()
            items[name] = {
                "image_url": parse_image_url(cells[0]),
                "price": {
                    "buy": parse_price(cells[2].text),
                    "sell": parse_price(cells[3].text),
                },
                "source": parse_source(cells[4]),
                "variations": parse_furniture_variations(cells[5]),
                "customization": parse_customization(cells[6]),
                "size_image_url": parse_image_img_url(cells[7]),
            }
    dump_data(items, "furniture/" + key)
    return items
def scrape_shoes(key):
    """Scrape the shoes clothing tables and dump the result as JSON."""
    url = URLS.get(key)
    response = requests.get(url, timeout=5)
    soup = BeautifulSoup(response.content, "html.parser")
    tables = soup.find_all("table", {"class": "roundy"})

    items = {}
    for index in range(2, 8):
        for row in tables[index].find_all("tr")[2:]:
            cells = row.find_all("td")
            name = cells[0].text.strip()
            item = {
                "name": name,
                "priceBuy": parse_price(cells[2].text),
                "priceSell": parse_price(cells[3].text),
                "source": parse_source(cells[4]),
                "variations": parse_variations(cells[5]),
                "variationImageLinks": get_image_links(cells[5].find_all("img")),
            }
            # Not every row links an image; only add the key when one exists.
            anchors = cells[1].find_all("a")
            if anchors:
                item["imageLink"] = anchors[0]["href"]
            items[name] = item
    dump_data(items, "clothing/" + key)
    return items
def scrape_fish(url_key):
    """Scrape museum fish data, enrich it via the ACNH API, and dump it.

    Args:
        url_key: Key into URLS["museum"] for the fish wiki page.

    Returns:
        Dict of fish keyed by snake_cased name.
    """
    items = {}
    url = URLS["museum"][url_key]
    response = requests.get(url, timeout=5)
    soup = BeautifulSoup(response.content, "html.parser")
    table = soup("table", {"class": "sortable"})[0]
    item_id = 1
    for tr in table("tr")[1:]:
        name = tr("td")[0].text.strip()
        item_key = name.replace(" ", "_").replace("-", "_")
        # Fetch catch/museum phrases from the ACNH API (one request per fish).
        acnhapi_url = URLS["api"] + "/fish/" + str(item_id)
        acnhapi_response = urllib.request.urlopen(acnhapi_url)
        acnhapi_data = json.loads(acnhapi_response.read())
        item = {
            "name": name,
            "id": item_id,
            # Consistency fix: use the shared URLS["wiki"] base like
            # scrape_bugs does instead of a hard-coded domain string.
            "wiki_url": URLS["wiki"] + tr("td")[0].find("a")["href"],
            "icon_url": tr("a")[1]['href'],
            "image_url": URLS["api"] + "/images/fish/" + str(item_id),
            "price": parse_price(tr("td")[2].text),
            "location": tr("td")[3].text.strip(),
            "shadow_size": tr("td")[4].text.strip(),
            "time": tr("small")[0].text.split(" & "),
            "months": {
                # Columns 6-17 are Jan-Dec for the northern hemisphere;
                # the southern hemisphere is the same data rotated 6 months.
                "northern": parse_months([tr("td")[i] for i in range(6, 18)]),
                "southern": parse_months(
                    [tr("td")[i] for i in list(range(12, 18)) + list(range(6, 12))]),
            },
            "catch_phrase": acnhapi_data["catch-phrase"],
            "museum_phrase": acnhapi_data["museum-phrase"],
        }
        item_id += 1
        items[item_key] = item
    dump_data(items, "museum/fish")
    # Fix: return the scraped items for parity with the other scrapers.
    return items
def scrape_bugs(url_key):
    """Scrape museum bug data, enrich it via the ACNH API, and dump it.

    Args:
        url_key: Key into URLS["museum"] for the bugs wiki page.

    Returns:
        Dict of bugs keyed by snake_cased name.
    """
    items = {}
    url = URLS["museum"][url_key]
    response = requests.get(url, timeout=5)
    soup = BeautifulSoup(response.content, "html.parser")
    # find the table to scrape from
    table = soup("table", {"class": "sortable"})[0]
    item_id = 1
    for tr in table("tr")[1:]:
        name = tr("td")[0].text.strip()
        item_key = name.replace(" ", "_").replace("-", "_")
        # Fetch catch/museum phrases from the ACNH API (one request per bug).
        acnhapi_url = URLS["api"] + "/bugs/" + str(item_id)
        acnhapi_response = urllib.request.urlopen(acnhapi_url)
        acnhapi_data = json.loads(acnhapi_response.read())
        item = {
            "name": name,
            "id": item_id,
            "wiki_url": URLS["wiki"] + tr("td")[0].find("a")["href"],
            "icon_url": tr("a")[1]['href'],
            "image_url": URLS["api"] + "/images/bugs/" + str(item_id),
            "price": parse_price(tr("td")[2].text),
            "location": tr("td")[3].text.strip(),
            "time": tr("small")[0].text.split(" & "),
            "months": {
                # Columns 5-16 are Jan-Dec for the northern hemisphere;
                # the southern hemisphere is the same data rotated 6 months.
                "northern": parse_months([tr("td")[i] for i in range(5, 17)]),
                "southern": parse_months(
                    [tr("td")[i] for i in list(range(11, 17)) + list(range(5, 11))]),
            },
            "catch_phrase": acnhapi_data["catch-phrase"],
            "museum_phrase": acnhapi_data["museum-phrase"],
        }
        item_id += 1
        items[item_key] = item
    dump_data(items, "museum/bugs")
    # Fix: return the scraped items for parity with the other scrapers.
    return items
def prepare_product(product):
    """Build a Shopify product-creation payload from a supplier record."""
    price = util.parse_price(product.get('retail_price', ''))
    quantity = sum(int(level) for level in product['inventory_details'].values())
    title = '{} / {}'.format(product.get('title'), product.get('description'))
    return {
        "title": title,
        "body_html": product.get('body_html'),
        "published": False,
        "vendor": product.get('supplier'),
        "product_type": product.get('subcategory'),
        # We don't have this information for now
        # "tags": 'carrand microfiber, microfiber',
        "weight": product.get('Weight'),
        "images": [{'src': image} for image in product.get('images')],
        "variants": [{
            "price": price,
            "inventory_quantity": quantity,
        }],
    }
def scrape_crafting_others(key):
    """Scrape the "other" crafting recipes table and dump it as JSON.

    Args:
        key: Lookup key into URLS giving the recipes wiki page URL.

    Returns:
        Dict of recipes keyed by item name.
    """
    url = URLS.get(key)
    response = requests.get(url, timeout=5)
    soup = BeautifulSoup(response.content, "html.parser")
    tables = soup("table", {"class": "roundy"})
    items = {}
    for tr in tables[2]("tr")[1:]:
        name = tr("td")[0].a.text
        item = {
            "image_url": tr("a")[1]['href'],
            "materials": parse_materials(tr("td")[2]),
            # TODO add nook miles .replace(")", "Nook Miles)")
            "obtained_from": parse_obtained_from(tr("td")[4]),
            "price": parse_price(tr("td")[5].text),
        }
        # Bug fix: the original called tr("td")[3].img.get("data-src") before
        # checking that an <img> exists, so rows without an image raised
        # AttributeError and the `else: None` branch was unreachable.
        img = tr("td")[3].img
        if img is None:
            item["size_image_url"] = None
        else:
            # Lazily-loaded images keep the real URL in data-src; fall back
            # to the plain src attribute otherwise.
            item["size_image_url"] = img.get("data-src") or img.get("src")
        items[name] = item
    dump_data(items, "crafting/" + key)
    return items
def scrape_DIYothers(key):
    """Scrape the DIY "others" recipes table and dump it as JSON.

    Args:
        key: Lookup key into URLS giving the recipes wiki page URL.

    Returns:
        Dict of recipes keyed by item name.
    """
    url = URLS.get(key)
    response = requests.get(url, timeout=5)
    soup = BeautifulSoup(response.content, "html.parser")
    tables = soup.find_all("table", {"class": "roundy"})
    items = {}
    for tr in tables[2].find_all("tr")[1:]:
        cells = tr.find_all("td")
        name = cells[0].a.text
        # The wiki renders the Nook Miles currency as an icon image, so the
        # scraped text reads "Nook Stop (1,000 )". Restore the dropped unit
        # name per line (replaces the old exact-list-equality hack).
        obtained_from = [
            line.replace("(1,000 )", "(1,000 Nook Miles)")
            for line in cells[4].text.strip().strip("\n").splitlines()
        ]
        items[name] = {
            "name": name,
            "imageLink": tr.find_all("a")[1]['href'],
            "materials": separate_by_br(cells[2]).lstrip().strip("\n").split(","),
            "materialsImageLink": get_image_links(cells[2].find_all("img")),
            "sizeImageLink": cells[3].img.get("data-src"),
            "obtainedFrom": obtained_from,
            "price": parse_price(cells[5].text),
        }
    dump_data(items, "crafting/" + key)
    return items
def priceimgadv():
    """Serve the image, with advanced options.

    The StringIO trick is from here:
    http://stackoverflow.com/a/10170635/576932
    """
    args = request.args
    raw_price = args.get('price')
    currency_out = args.get('currency', 'BTC').upper()
    raw_color = args.get('color', '0')

    # Validate each argument in turn, answering with a plain-text error.
    try:
        amount, currency_in = util.parse_price(raw_price)
    except ValueError:
        return "Error: bad price argument"

    try:
        color = util.parse_color(raw_color)
    except ValueError:
        return "Error: bad color argument"

    try:
        rate = util.get_exchange_rate(currency_in, currency_out)
    except KeyError:
        return 'Error: unsupported currency pair - %s -> %s' % (currency_in,
                                                                currency_out)
    except Exception:
        return 'Error: exchange rate error'

    image = util.get_image_io(amount * rate, currency_out, color)
    return send_file(image, attachment_filename='img.png')
def _product_row(product):
    """Build one Shopify CSV import row from a supplier product record."""
    grams_per_lb = 453.59237  # Grams / lbs rate
    price = util.parse_price(product.get('retail_price', ''))
    quantity = sum(int(level) for level in product['inventory_details'].values())
    # Title must be less than 255
    # Inventory must be integer
    return {
        'Handle': product['title'],
        'Title': '{} / {}'.format(product.get('title'),
                                  product.get('description')),
        'Body (HTML)': product.get('body_html'),
        'Vendor': product.get('supplier'),
        'Type': product.get('subcategory'),
        # 'Tags': Generate them...
        'Published': True,  # Set to True
        'Option1 Name': 'Title',
        'Option1 Value': 'Default Title',
        # 'Variant SKU': Should be keystone part
        'Variant Grams': product.get('Weight') * grams_per_lb,
        'Variant Inventory Qty': str(quantity),
        'Variant Inventory Policy': 'deny',
        'Variant Fulfillment Service': 'manual',
        'Variant Price': price,
        'Variant Requires Shipping': True,
        # Iterate for each image
        # 'Image Src': ,
        # 'Variant Image',
        'Variant Weight Unit': 'lb',
    }
def scrape_equipments(key):
    """Scrape the craftable equipment table and dump it as JSON."""
    url = URLS.get(key)
    response = requests.get(url, timeout=5)
    soup = BeautifulSoup(response.content, "html.parser")
    tables = soup.find_all("table", {"class": "sortable"})
    items = {}
    # Skip the header row.
    for row in tables[0].find_all("tr")[1:]:
        cells = row.find_all("td")
        name = cells[0].a.text
        items[name] = {
            "name": name,
            "imageLink": row.find_all("a")[1]['href'],
            "materials": separate_by_br(cells[2]).lstrip().strip("\n").split(","),
            "materialsImageLink": get_image_links(cells[2].find_all("img")),
            "sizeImageLink": cells[3].img.get("data-src"),
            "obtainedFrom": cells[4].text.strip().strip("\n").splitlines(),
            "price": parse_price(cells[5].text),
        }
    dump_data(items, "crafting/" + key)
    return items
def priceimgadv():
    """Serve the image, with advanced options.

    The StringIO trick is from here:
    http://stackoverflow.com/a/10170635/576932
    """
    query = request.args
    price_arg = query.get('price')
    out_currency = query.get('currency', 'BTC').upper()
    color_arg = query.get('color', '0')

    # Parse and validate the query arguments one by one.
    try:
        value, in_currency = util.parse_price(price_arg)
    except ValueError:
        return "Error: bad price argument"

    try:
        parsed_color = util.parse_color(color_arg)
    except ValueError:
        return "Error: bad color argument"

    try:
        fx = util.get_exchange_rate(in_currency, out_currency)
    except KeyError:
        return 'Error: unsupported currency pair - %s -> %s' % (
            in_currency, out_currency)
    except Exception:
        return 'Error: exchange rate error'

    converted = value * fx
    buffer = util.get_image_io(converted, out_currency, parsed_color)
    return send_file(buffer, attachment_filename='img.png')
def scrape_umbrellas(key):
    """Scrape the umbrella clothing table and dump it as JSON."""
    url = URLS.get(key)
    response = requests.get(url, timeout=5)
    soup = BeautifulSoup(response.content, "html.parser")
    tables = soup.find_all("table", {"class": "roundy"})
    items = {}
    # The third "roundy" table holds the umbrellas; skip its two header rows.
    for row in tables[2].find_all("tr")[2:]:
        cells = row.find_all("td")
        name = cells[0].text.strip()
        items[name] = {
            "name": name,
            "imageLink": cells[1].find_all("a")[0]["href"],
            "source": parse_source(cells[2]),
            "priceBuy": parse_price(cells[3].text),
            "priceSell": parse_price(cells[4].text),
        }
    dump_data(items, "clothing/" + key)
    return items
def scrape_furniture_wallpapers(key):
    """Scrape the wallpaper furniture table and dump it as JSON."""
    url = URLS["furniture"][key]
    response = requests.get(url, timeout=5)
    # html5lib is required here: html.parser does not scrape all html contents.
    soup = BeautifulSoup(response.content, "html5lib")
    tables = soup("table", {"class": "roundy"})
    items = {}
    # The fourth "roundy" table holds the wallpapers; skip the header rows.
    for row in tables[3]("tr")[2:]:
        cells = row("td")
        name = cells[1].text.strip()
        items[name] = {
            "image_url": parse_image_url(cells[0]),
            "price": {
                "buy": parse_price(cells[2].text),
                "sell": parse_price(cells[3].text),
            },
            "source": parse_source(cells[4]),
        }
    dump_data(items, "furniture/" + key)
    return items
def scrape_music(key):
    """Scrape the music (K.K. songs) table and dump it as JSON."""
    url = URLS.get(key)
    response = requests.get(url, timeout=5)
    soup = BeautifulSoup(response.content, "html.parser")
    tables = soup("table", {"class": "article-table"})
    items = {}
    # Skip the header row; key entries by a snake_cased song name.
    for row in tables[0]("tr")[1:]:
        cells = row("td")
        name = cells[0].text.strip()
        item_key = name.replace(" ", "_").replace("-", "_")
        items[item_key] = {
            "name": name,
            "image_url": parse_image_url(cells[1]),
            "priceBuy": parse_price(cells[2].text),
            "priceSell": parse_price(cells[3].text),
            "source": parse_source(cells[4]),
        }
    dump_data(items, "music/" + key)
    return items
def scrape_bugs(key):
    """Scrape the bug availability table and dump it as JSON."""
    url = URLS.get(key)
    response = requests.get(url, timeout=5)
    soup = BeautifulSoup(response.content, "html.parser")
    # The first sortable table holds the bugs.
    table = soup("table", {"class": "sortable"})
    items = {}
    # Skip the header row.
    for row in table[0]("tr")[1:]:
        cells = row("td")
        name = cells[0].a.text
        # Columns 5-16 are Jan-Dec for the northern hemisphere; the southern
        # hemisphere is the same twelve columns rotated by six months.
        northern = [cells[i] for i in range(5, 17)]
        southern = northern[6:] + northern[:6]
        items[name] = {
            "image_url": row("a")[1]['href'],
            "price": parse_price(cells[2].text),
            "location": cells[3].text.strip(),
            "time": row("small")[0].text,
            "months": {
                "northern": parse_months(northern),
                "southern": parse_months(southern),
            },
        }
    # Persist and return for debugging.
    dump_data(items, "museum/" + key)
    return items
def scrape_fish(key):
    """Scrape the fish availability table and dump it as JSON (same logic as scrape_bugs)."""
    url = URLS.get(key)
    response = requests.get(url, timeout=5)
    soup = BeautifulSoup(response.content, "html.parser")
    table = soup("table", {"class": "sortable"})
    items = {}
    # Skip the header row.
    for row in table[0]("tr")[1:]:
        cells = row("td")
        name = cells[0].a.text
        # Columns 6-17 are Jan-Dec for the northern hemisphere; the southern
        # hemisphere is the same twelve columns rotated by six months.
        northern = [cells[i] for i in range(6, 18)]
        southern = northern[6:] + northern[:6]
        items[name] = {
            "image_url": row("a")[1]['href'],
            "price": parse_price(cells[2].text),
            "location": cells[3].text.strip(),
            "shadow_size": cells[4].text.strip(),
            "time": row("small")[0].text,
            "months": {
                "northern": parse_months(northern),
                "southern": parse_months(southern),
            },
        }
    dump_data(items, "museum/" + key)
    return items
def scrape_furniture_housewares(key):
    """Scrape the housewares furniture table and dump it as JSON.

    Args:
        key: Lookup key into URLS giving the furniture wiki page URL.

    Returns:
        Dict of furniture items keyed by item name.
    """
    url = URLS.get(key)
    response = requests.get(url, timeout=5)
    soup = BeautifulSoup(response.content, "html.parser")
    tables = soup.find_all("table", {"class": "roundy"})
    items = {}
    # Bug fix: removed a leftover debug print of the entire row list.
    for tr in tables[3]("tr")[2:]:
        cells = tr.find_all("td")
        name = cells[1].text.strip()
        item = {
            "name": name,
            "priceBuy": parse_price(cells[2].text),
            "priceSell": parse_price(cells[3].text),
            "source": parse_source(cells[4]),
            "variations": parse_variations(cells[5]),
            "customization": False,
            "sizeLink": cells[6].img.get("data-src"),
        }
        # Bug fix: the original tested cells[1] for anchors but then read the
        # anchor from cells[0] (the image column), which could raise
        # IndexError. Test the same cell that is read.
        image_anchors = cells[0].find_all("a")
        if image_anchors:
            item["imageLink"] = image_anchors[0]["href"]
        items[name] = item
    dump_data(items, "furniture/" + key)
    return items
def scrape_tools(key):
    """Scrape the craftable tools table and dump it as JSON."""
    url = URLS.get(key)
    response = requests.get(url, timeout=5)
    soup = BeautifulSoup(response.content, "html.parser")
    tables = soup("table", {"class": "sortable"})
    items = {}
    # Skip the header row.
    for row in tables[0]("tr")[1:]:
        cells = row("td")
        name = cells[0].a.text
        items[name] = {
            "image_url": row("a")[1]['href'],
            "materials": parse_materials(cells[2]),
            "size_image_url": cells[3].img.get("data-src"),
            "obtained_from": parse_obtained_from(cells[4]),
            "price": parse_price(cells[5].text),
        }
    dump_data(items, "crafting/" + key)
    return items
def update_data(old, new):
    """Copy pricing and inventory fields from `new` onto `old` in place and return `old`."""
    old['my_price'] = new['my_price']
    old['jobber_price'] = new['jobber_price']

    retail = new.get('retail_price')
    if retail:
        # Mark the retail price up by config.PRICE_RATE (20% by default).
        marked_up = util.parse_price(retail)
        marked_up += marked_up * config.PRICE_RATE
        old['retail_price'] = marked_up

    old['inventory'] = new['inventory']
    old['inventory_details'] = new['inventory_details']
    return old
def priceimgadv():
    """Serve the image, with advanced options.

    The StringIO trick is from here:
    http://stackoverflow.com/a/10170635/576932
    """
    args = request.args
    raw_price = args.get('price')
    currency_out = args.get('currency', 'BTC').upper()
    raw_color = args.get('color', '0')
    raw_dpr = args.get('dpr', '1x')

    # Parse and validate the query arguments one by one.
    try:
        amount, currency_in = util.parse_price(raw_price)
    except Exception:
        return 'Error: bad price argument'

    try:
        color = util.parse_color(raw_color)
    except Exception:
        return 'Error: bad color argument'

    # Device-pixel-ratio comes in CSS style, e.g. "2x".
    try:
        dpr = float(raw_dpr.rstrip('x'))
    except Exception:
        return 'Error: bad dpr argument'
    if dpr > MAX_DPR:
        return 'Error: maximum dpr is %d' % MAX_DPR
    if dpr <= 0:
        return 'Error: dpr must be greater than 0'

    try:
        rate = util.get_exchange_rate(currency_in, currency_out)
    except KeyError:
        return 'Error: unsupported currency pair - %s -> %s' % (
            currency_in, currency_out)
    except Exception:
        return 'Error: exchange rate error'

    image = util.get_image_io(dpr, amount * rate, currency_out, color)
    return send_file(image, attachment_filename='img.png')
def priceimgadv():
    """Serve the image, with advanced options.

    The StringIO trick is from here:
    http://stackoverflow.com/a/10170635/576932
    """
    query = request.args
    price_arg = query.get('price')
    out_currency = query.get('currency', 'BTC').upper()
    color_arg = query.get('color', '0')
    dpr_arg = query.get('dpr', '1x')

    try:
        value, in_currency = util.parse_price(price_arg)
    except Exception:
        return 'Error: bad price argument'

    try:
        parsed_color = util.parse_color(color_arg)
    except Exception:
        return 'Error: bad color argument'

    # Device-pixel-ratio comes in CSS style, e.g. "2x"; bound it sanely.
    try:
        dpr = float(dpr_arg.rstrip('x'))
    except Exception:
        return 'Error: bad dpr argument'
    if dpr > MAX_DPR:
        return 'Error: maximum dpr is %d' % MAX_DPR
    if dpr <= 0:
        return 'Error: dpr must be greater than 0'

    try:
        fx = util.get_exchange_rate(in_currency, out_currency)
    except KeyError:
        return 'Error: unsupported currency pair - %s -> %s' % (in_currency,
                                                                out_currency)
    except Exception:
        return 'Error: exchange rate error'

    converted = value * fx
    buffer = util.get_image_io(dpr, converted, out_currency, parsed_color)
    return send_file(buffer, attachment_filename='img.png')
def update_product(data):
    """Push current price and inventory counts to an existing Shopify product.

    Args:
        data: Supplier record containing 'shopify_id', 'inventory_details'
            and optionally 'retail_price'.
    """
    id_ = data['shopify_id']
    print('Updating shopify product', id_)
    product = shopify.Product.find(id_)

    count = sum(map(int, data['inventory_details'].values()))
    # Fix: price parsing is loop-invariant; the original re-parsed it for
    # every variant. Parse it once up front.
    price = util.parse_price(data.get('retail_price', ''))

    for variant in product.variants:
        # Update variant properties...
        # variant.price = data['retail_price']  # This is deprecated
        variant.inventory_management = 'shopify'
        variant.inventory_quantity = count
        variant.price = price

    # Commit changes
    product.save()
# Export card prices from ligapokemon.com.br, one CSV per collection given
# on the command line.
timestamp = datetime.datetime.now().strftime("%Y-%m-%d")
url = "https://ligapokemon.com.br/?view=cards/search&card=ed={}&page={}"

for collection_arg in sys.argv[1:]:
    current_page = 1
    total_pages = None
    # Bug fix: the original reopened the export file with mode "w" inside the
    # pagination loop, truncating it on every page so only the final page's
    # rows survived (and the handle was never closed). Open it once per
    # collection, and close it via the context manager.
    with open(f"exports/export_{timestamp}_{collection_arg}.csv", "w") as export_file:
        writer = csv.writer(export_file)
        while current_page:
            page = requests.get(url.format(collection_arg, current_page))
            soup = BeautifulSoup(page.content, "html.parser")
            if not total_pages:
                # Result count is rendered as "<b>N</b>"; 30 cards per page.
                total_pages = ceil(int(soup.find(id="paginacao-1").b.contents[0]) / 30)
            # soup = BeautifulSoup(open("backup.html", "r"), "html.parser")
            rows = soup.find("table", id="cotacao-busca").find_all("tr")
            # first row is table header, so ignore it
            for row in rows[1:]:
                splitted_name = split_name_code(row.find(class_="preto").string)
                content = {
                    "name_pt": splitted_name["name_pt"],
                    "name_en": splitted_name["name_en"],
                    "code": splitted_name["code"],
                    "price_min": parse_price(row.find(class_="preMen").p.contents[0]),
                    "price_avg": parse_price(row.find(class_="preMed").p.contents[0]),
                    "price_max": parse_price(row.find(class_="preMai").p.contents[0]),
                }
                writer.writerow(content.values())
            current_page = current_page + 1 if current_page < total_pages else None