def scrape_matchstick():
    """Scrape Matchstick's coffee listing and upsert each coffee.

    Logs new/updated counts and any error coffees; returns nothing.
    Fixes: named the BeautifulSoup parser explicitly, set ``product_url``
    (it was initialized to "" and never assigned), and corrected the
    'Matchstic' typo in the updated-results log line.
    """
    roaster = "Matchstick"
    r = requests.get('http://www.matchstickcoffee.com/coffee/')
    soup = BeautifulSoup(r.content, "html.parser")
    coffees_for_sale = soup.find_all('div', {'class': 'type-post'})
    total_coffees = len(coffees_for_sale)
    coffees_entered = 0
    coffees_updated = 0
    error_coffees = []
    for item in coffees_for_sale:
        name, description, notes, region, active, size, product_url = [""] * 7
        price = float()
        url = item.a['href']
        # fix: record the detail-page URL (was left as "")
        product_url = url
        region = item.find(text='Origin:').next_element.strip()
        noteloc = item.find(text='Notes:').next_element
        notes = [x.strip() for x in noteloc.split(',')]
        # price/size rendered together as e.g. "$17.00 / 12oz"
        price_and_size = noteloc.next_element.next_element.text.split(' / ')
        price = float(price_and_size[0][1:])  # strip leading '$'
        size = price_and_size[1]
        active = True
        coffee_soup = BeautifulSoup(requests.get(url).content, "html.parser")
        name = coffee_soup.h1.string
        # not sure if the descriptions here matter at all
        description = coffee_soup.find(text='Notes:').next_element.next_element.next_element.next_element.next_element.next_element.next_element
        # url may had unicode stuff
        image_url = item.find('img')['src']
        image_content = requests.get(image_url).content
        coffee_data = {'name': name, 'roaster': roaster,
                       'description': description, 'price': price,
                       'notes': notes, 'region': region, 'active': active,
                       'product_page': product_url, 'size': size,
                       'image': image_content}
        coffees_updated, coffees_entered, error_coffees = add_or_update_coffee(
            coffee_data, coffees_updated, coffees_entered, error_coffees)
    logging.info('Matchstick New Results:{} / {}'.format(coffees_entered, total_coffees))
    logging.info('Matchstick Updated Results:{} / {}'.format(coffees_updated, total_coffees))
    if error_coffees:
        logging.warning('Matchstick Error coffees are: {}'.format(error_coffees))
def scrape_stumptown():
    """Scrape Stumptown's coffee listing and upsert each coffee.

    Skips 'trio' bundle products. Logs new/updated counts and any error
    coffees; returns nothing.
    Fixes: deprecated ``logging.warn`` -> ``logging.warning``, raw string
    for the size regex, and ``'trio' not in url`` idiom.
    """
    roaster = 'Stumptown'
    stumptown = 'https://www.stumptowncoffee.com/coffee'
    r = requests.get(stumptown)
    soup = BeautifulSoup(r.content, "html.parser")
    # class="product-grid _link"
    coffees_for_sale = soup.find_all('a', {'class': 'product-grid _link'})
    # keeping track of how many coffees
    total_coffees = len(coffees_for_sale)
    coffees_entered = 0
    coffees_updated = 0
    error_coffees = []
    for items in coffees_for_sale:
        url = items['href']
        if 'trio' not in url:
            name, price, description, notes, region, active, size = [""] * 7
            product_url = 'https://www.stumptowncoffee.com' + url
            logging.info("Getting url: {}".format(url))
            r = requests.get(product_url)
            coffee_soup = BeautifulSoup(r.content, "html.parser")
            # product name h1 class="product _title -desktop theme-color js-pdp-title"
            name = coffee_soup.h1.string.strip()
            try:
                price = float(coffee_soup.find_all('span', {'class': 'js-pdp-price'})[0].string)
            except IndexError as e:
                logging.warning("Error while getting price for {} : {}".format(name, e))
            # div class="product _description
            description = coffee_soup.find('div', {'class': 'product _description'}).p.string
            try:
                notes = coffee_soup.h3.string.replace('&', ',').lower().split(',')
            except AttributeError:
                # no notes found
                pass
            region = country_from_name(name)
            if coffee_soup.h6:
                # its sold out
                active = False
            else:
                active = True
            # size in ounces
            try:
                size = '{} oz'.format(re.findall(r'\d+', coffee_soup.find('div', {'class': 'product _specs'}).find_all('p')[1].string)[0])
            except Exception as e:
                logging.warning("Error while getting size for {} : {}".format(name, e))
            image_url = coffee_soup.select('div.product._image')[0].find('span')['data-src']
            image_content = requests.get(image_url).content
            coffee_data = {'name': name, 'roaster': roaster,
                           'description': description, 'price': price,
                           'notes': notes, 'region': region, 'active': active,
                           'product_page': product_url, 'size': size,
                           'image': image_content}
            coffees_updated, coffees_entered, error_coffees = add_or_update_coffee(
                coffee_data, coffees_updated, coffees_entered, error_coffees)
        else:
            # bundles don't count toward the coffee total
            total_coffees -= 1
    logging.info('Stumptown New Results:{} / {}'.format(coffees_entered, total_coffees))
    logging.info('Stumptown Updated Results:{} / {}'.format(coffees_updated, total_coffees))
    if error_coffees:
        logging.warning('Stumptown Error coffees are: {}'.format(error_coffees))
def scrape_intelli():
    """Scrape Intelligentsia's AJAX catalog endpoint (JSON) and upsert coffees.

    Logs new/updated counts and any error coffees; returns nothing.
    Fixes: dropped the unused BeautifulSoup parse (the endpoint returns
    JSON, not HTML) and corrected the 'Intelligensia' typo in the
    error-log message.
    """
    urlfetch.set_default_fetch_deadline(10)
    roaster = 'Intelligentsia'
    intelli = 'https://www.intelligentsiacoffee.com/catalog/ajax/products/?filter%5Bcat%5D=5'
    r = requests.get(intelli)
    x = r.json()
    total_coffees = len(x['data'])
    coffees_entered = 0
    coffees_updated = 0
    error_coffees = []
    for item in x['data']:
        name, description, notes, region, active, size, product_url = [""] * 7
        price = int()
        product_url = item['productUrl']
        logging.info("Getting url: {}".format(product_url))
        try:
            notes = item['flavor_profile_text'].split(',')
        except KeyError:
            # some products carry no flavor profile
            notes = [""]
        name = item['original_name']
        description = item['description']
        region = item['country']
        price = float(item['price'])
        # the catalog only sells 12oz bags; size is not in the payload
        size = '12oz'
        active = True
        image_url = 'https://www.intelligentsiacoffee.com/media/catalog/product' + item['small_image']
        image_blob = requests.get(image_url).content
        coffee_data = {
            'name': name,
            'roaster': roaster,
            'description': description,
            'price': price,
            'notes': notes,
            'region': region,
            'active': active,
            'product_page': product_url,
            'size': size,
            'image': image_blob
        }
        coffees_updated, coffees_entered, error_coffees = add_or_update_coffee(
            coffee_data, coffees_updated, coffees_entered, error_coffees)
    logging.info('Intelligentsia New Results:{} / {}'.format(
        coffees_entered, total_coffees))
    logging.info('Intelligentsia Updated Results:{} / {}'.format(
        coffees_updated, total_coffees))
    if error_coffees:
        logging.warning(
            'Intelligentsia Error coffees are: {}'.format(error_coffees))
def scrape_bluebottle():
    """Scrape Blue Bottle's store page and upsert each coffee.

    Skips non-coffee products (boxes, kits, subscriptions). Logs
    new/updated counts and any error coffees; returns nothing.
    """
    roaster = 'Blue Bottle'
    bluebottle = 'https://bluebottlecoffee.com/store/coffee'
    r = requests.get(bluebottle)
    soup = BeautifulSoup(r.content, "html.parser")
    # each product title is an <h2> with these utility classes
    coffees_for_sale = soup.find_all('h2', {'class': 'f5 lh-title man'})
    total_coffees = len(coffees_for_sale)
    coffees_entered = 0
    coffees_updated = 0
    error_coffees = []
    # name fragments marking non-coffee products to skip
    ignored = ['Box', 'Kit', 'Subscriptions', 'at Home']
    for item in coffees_for_sale:
        name, description, notes, region, active, size, product_url = [""] * 7
        price = float()
        name = item.string
        if any(word in name for word in ignored):
            # not a coffee; don't count it toward the total
            total_coffees -= 1
        else:
            url = item.a['href']
            product_url = 'https://bluebottlecoffee.com' + url
            logging.info("Getting url: {}".format(url))
            r = requests.get(product_url)
            coffee_soup = BeautifulSoup(r.content, "html.parser")
            active = True
            price = float(coffee_soup.find('span', {'class': 'js-variant-price'}).string)
            description = coffee_soup.find('p', {'class': 'spec-overview'}).string
            # first <p> on the page holds the comma-separated tasting notes
            notes = coffee_soup.p.string.lower().split(',')
            # only works for not single origin
            region = country_from_name(name)
            try:
                # spec-details may carry a more precise origin string
                details = coffee_soup.find('p', {'class': 'spec-details'}).contents[0].strip()
                if country_from_name(details) != '':
                    region = details
            except AttributeError:
                # if it's an espresso, then it's okay to not have region
                if 'Espresso' in name:
                    region = ""
            # size option reads like "12 oz Bag ..."; keep the part before "Bag"
            size = coffee_soup.find('select', {'id': 'cart_item_model_id'}).option.string.split('Bag')[0]
            image_url = coffee_soup.img['src']
            image_content = requests.get(image_url).content
            coffee_data = {'name': name, 'roaster': roaster,
                           'description': description, 'price': price,
                           'notes': notes, 'region': region, 'active': active,
                           'product_page': product_url, 'size': size,
                           'image': image_content}
            coffees_updated, coffees_entered, error_coffees = add_or_update_coffee(coffee_data, coffees_updated, coffees_entered, error_coffees)
    logging.info('Blue Bottle New Results:{} / {}'.format(coffees_entered, total_coffees))
    logging.info('Blue Bottle Updated Results:{} / {}'.format(coffees_updated, total_coffees))
    if error_coffees:
        logging.warning('Blue Bottle Error coffees are: {}'.format(error_coffees))
def scrape_heart():
    """Scrape Heart's beans collection and upsert each coffee.

    Skips subscription products. Logs new/updated counts and any error
    coffees; returns nothing.
    """
    roaster = 'Heart'
    heart_beans = 'http://www.heartroasters.com/collections/beans'
    heart_url = 'http://www.heartroasters.com'
    r = requests.get(heart_beans)
    soup = BeautifulSoup(r.content, "html.parser")
    all_coffees_for_sale = soup.find_all('a', {'class': 'grid__image'})
    all_coffee_links = []
    for coffee in all_coffees_for_sale:
        # the image alt text distinguishes coffees from subscriptions
        if not 'Subscription' in coffee.find('img')['alt']:
            all_coffee_links.append("{}{}".format(heart_url, coffee['href']))
    total_coffees = len(all_coffee_links)
    coffees_entered = 0
    coffees_updated = 0
    error_coffees = []
    for url in all_coffee_links:
        name, price, description, notes, region, active, size = [""] * 7
        logging.info("Getting url: {}".format(url))
        r = requests.get(url)
        coffee_soup = BeautifulSoup(r.content, "html.parser")
        blend = False
        active = True
        name = coffee_soup.h1.text.strip()
        if 'blend' in name.lower():
            blend = True
        # first <option> reads like "<size> - <price>" (or contains "Sold Out")
        size_price = coffee_soup.find('option').text
        size = size_price.split(" - ")[0]
        if 'Sold Out' in size_price:
            active = False
            price = 0
        else:
            price = float(size_price.split(" - ")[1].replace('USD', '').replace('$', ''))
        description = coffee_soup.find('div', {'class': 'tab-content small'}).find('div', {'id': 'tab1'}).text.encode('utf-8').strip()
        notes = coffee_soup.find('p', {'class': 'small uppercase flavors'}).text.split(',')
        if not blend:
            # blends have no single origin, so region is only set for single origins
            region = country_from_name(name)
        # region = coffee_soup.find('div', {'class':'tab-content small'}).find('div',{'id': 'tab1'}).p.text.replace(u'Location:\xa0', '').replace('Location:', '').encode('utf-8')
        image_url = "http:{}".format(coffee_soup.select('div.slide')[0].find('img')['src'])
        image_content = requests.get(image_url).content
        coffee_data = {'name': name, 'roaster': roaster,
                       'description': description, 'price': price,
                       'notes': notes, 'region': region, 'active': active,
                       'product_page': url, 'size': size,
                       'image': image_content}
        coffees_updated, coffees_entered, error_coffees = add_or_update_coffee(coffee_data, coffees_updated, coffees_entered, error_coffees)
    logging.info('Heart New Results:{} / {}'.format(coffees_entered, total_coffees))
    logging.info('Heart Updated Results:{} / {}'.format(coffees_updated, total_coffees))
    if error_coffees:
        logging.warning('Heart Error coffees are: {}'.format(error_coffees))
def scrape_fortyninth():
    """Scrape 49th Parallel's coffee collection and upsert each coffee.

    Skips subscription products. Logs new/updated counts and any error
    coffees; returns nothing.
    Fixes: named the BeautifulSoup parser explicitly (twice) and corrected
    the 'Paralell' typo in the error-log message.
    """
    roaster = '49th Parallel'
    base_url = 'http://49thcoffee.com/collections/coffee'
    r = requests.get(base_url)
    soup = BeautifulSoup(r.content, "html.parser")
    coffees_for_sale = soup.find_all('li', {'class': 'product-listing'})
    total_coffees = len(coffees_for_sale)
    coffees_entered = 0
    coffees_updated = 0
    error_coffees = []
    ignored = ['Subscription']
    for item in coffees_for_sale:
        name, description, notes, region, active, size, product_url = [""] * 7
        price = float()
        name = item.h1.string
        if any(word in name for word in ignored):
            total_coffees -= 1
        else:
            url = item.a['href']
            product_url = 'http://49thcoffee.com' + url
            logging.info("Getting url: {}".format(product_url))
            r = requests.get(product_url)
            coffee_soup = BeautifulSoup(r.content, "html.parser")
            # logging.info("Title: {}".format(coffee_soup.title))
            details = coffee_soup.find('div', itemprop='description')
            # concatenate the description paragraph's fragments
            # NOTE(review): sentence.string can be None for nested tags — verify
            d = details.p
            for sentence in d:
                description += sentence.string
            notes = details.h3.string.lower()
            notes = notes.split(' // ')
            region = coffee_soup.find('li', {'class': 'product-detail-country'}).string.split()[1]
            size = item.find('data', {'class': 'product-size'}).string.strip()
            price = float(item.find('data', {'class': 'product-price'}).string[1:])
            active = True
            image_url = 'https:' + coffee_soup.find('meta', itemprop='image')['content']
            image_content = requests.get(image_url).content
            coffee_data = {'name': name, 'roaster': roaster,
                           'description': description, 'price': price,
                           'notes': notes, 'region': region, 'active': active,
                           'product_page': product_url, 'size': size,
                           'image': image_content}
            coffees_updated, coffees_entered, error_coffees = add_or_update_coffee(
                coffee_data, coffees_updated, coffees_entered, error_coffees)
    logging.info('49 Parallel New Results:{} / {}'.format(coffees_entered, total_coffees))
    logging.info('49 Parallel Updated Results:{} / {}'.format(coffees_updated, total_coffees))
    if error_coffees:
        logging.warning('49 Parallel Error coffees are: {}'.format(error_coffees))
def scrape_intelli():
    """Scrape Intelligentsia's HTML product listing and upsert each coffee.

    De-duplicates listing entries, then fetches each product page for
    price/size/region/description. Logs new/updated counts and any error
    coffees; returns nothing.
    """
    urlfetch.set_default_fetch_deadline(10)
    roaster = 'Intelligentsia'
    intelli = 'http://www.intelligentsiacoffee.com/products/coffee'
    r = requests.get(intelli)
    soup = BeautifulSoup(r.content, "html.parser")
    # each coffee under class="grid_4 node node-type-product-coffee node-teaser build-mode-teaser""
    coffees_for_sale = soup.find_all('div', {'class': 'node-type-product-coffee'})
    # there are duplicates, must check
    seen = set()
    uniq_coffees_for_sale = []
    for x in coffees_for_sale:
        if x not in seen:
            uniq_coffees_for_sale.append(x)
            seen.add(x)
    total_coffees = len(uniq_coffees_for_sale)
    coffees_entered = 0
    coffees_updated = 0
    error_coffees = []
    for item in uniq_coffees_for_sale:
        name, description, notes, region, active, size, product_url = [""] * 7
        price = int()
        product_url = 'http://www.intelligentsiacoffee.com' + item.a['href']
        logging.info("Getting url: {}".format(product_url))
        # tasting notes are interleaved text nodes at fixed offsets in the teaser <p>
        notes_list = item.p.contents
        notes = [notes_list[2].strip().lower(), notes_list[4].strip().lower(), notes_list[6].strip().lower()]
        name = item.find('div', {'class': 'productListingDescBox'}).strong.string
        r = requests.get(product_url)
        coffee_soup = BeautifulSoup(r.content, "html.parser")
        try:
            # price text like "$18.00"; skip the leading '$'
            price = float(coffee_soup.find('p', {'class': 'coffeeDetailPrice'}).em.string[1:])
            # size gives value + unit
            size = coffee_soup.find('p', {'class': 'coffeeDetailPrice'}).em.next_sibling.strip()[2:]
            active = True
        except AttributeError:
            logging.info("no price or size for: {}".format(product_url))
            # if 'OUT' in coffee_soup.find('p', {'class': 'coffeeDetailPrice'}).string:
            # its sold out
            active = False
            pass
        blend_or_origin = coffee_soup.find_all('p', {'class': 'coffeeDetailExtraInfoHeader'})
        blend_or_origin = [x.string for x in blend_or_origin]
        # region + country
        try:
            region = coffee_soup.find(text='Country').next_element.string
        except AttributeError:
            # check if it's a blend
            if 'Blend' in blend_or_origin:
                region = 'Blend'
            pass
        image_url = coffee_soup.find('div', {'class': 'productPhotoSlide'}).find('img')['src']
        image_blob = requests.get(image_url).content
        description = coffee_soup.find('div', {'class': 'product-body'}).string
        coffee_data = {'name': name, 'roaster': roaster,
                       'description': description, 'price': price,
                       'notes': notes, 'region': region, 'active': active,
                       'product_page': product_url, 'size': size,
                       'image': image_blob}
        coffees_updated, coffees_entered, error_coffees = add_or_update_coffee(coffee_data, coffees_updated, coffees_entered, error_coffees)
    logging.info('Intelligentsia New Results:{} / {}'.format(coffees_entered, total_coffees))
    logging.info('Intelligentsia Updated Results:{} / {}'.format(coffees_updated, total_coffees))
    if error_coffees:
        logging.warning('Intelligensia Error coffees are: {}'.format(error_coffees))
def scrape_stumptown():
    """Scrape Stumptown's coffee listing (CSS-selector variant) and upsert coffees.

    Skips 'trio' bundle products. Logs new/updated counts and any error
    coffees; returns nothing.
    Fixes: deprecated ``logging.warn`` -> ``logging.warning`` and raw
    string for the size regex.
    """
    roaster = 'Stumptown'
    stumptown = 'https://www.stumptowncoffee.com/coffee'
    base_url = 'https://www.stumptowncoffee.com'
    r = requests.get(stumptown)
    soup = BeautifulSoup(r.content, "html.parser")
    coffees_for_sale = soup.select('a.product-grid._link')
    total_coffees = len(coffees_for_sale)
    coffees_entered = 0
    coffees_updated = 0
    error_coffees = []
    ignored = "trio"
    for items in coffees_for_sale:
        url = items['href']
        if ignored in url:
            # bundles don't count toward the coffee total
            total_coffees = total_coffees - 1
            continue
        name, price, description, notes, region, active, size = [""] * 7
        product_url = base_url + url
        logging.info("Getting url: {}".format(url))
        r = requests.get(product_url)
        coffee_soup = BeautifulSoup(r.content, "html.parser")
        name = coffee_soup.h1.string.strip()
        price = float(coffee_soup.select_one('span.js-pdp-price').text)
        description = coffee_soup.select_one('div.product._description').p.text
        try:
            notes = coffee_soup.h3.string.replace('&', ',').lower().split(',')
        except AttributeError:
            # no notes found
            pass
        region = country_from_name(name)
        active = True
        if coffee_soup.h6:
            # its sold out
            active = False
        # size in ounces, pulled from the second specs paragraph
        try:
            size = '{} oz'.format(
                re.findall(
                    r'\d+',
                    coffee_soup.select_one('div.product._specs').find_all('p')
                    [1].string)[0])
        except Exception as e:
            logging.warning("Error while getting size for {} : {}".format(
                name, e))
        image_url = coffee_soup.select_one(
            'div.product._image span')['data-src']
        image_content = requests.get(image_url).content
        coffee_data = {
            'name': name,
            'roaster': roaster,
            'description': description,
            'price': price,
            'notes': notes,
            'region': region,
            'active': active,
            'product_page': product_url,
            'size': size,
            'image': image_content
        }
        coffees_updated, coffees_entered, error_coffees = add_or_update_coffee(
            coffee_data, coffees_updated, coffees_entered, error_coffees)
    logging.info('Stumptown New Results:{} / {}'.format(
        coffees_entered, total_coffees))
    logging.info('Stumptown Updated Results:{} / {}'.format(
        coffees_updated, total_coffees))
    if error_coffees:
        logging.warning(
            'Stumptown Error coffees are: {}'.format(error_coffees))
def scrape_victrola():
    """Scrape Victrola's coffee listing and upsert each coffee.

    Skips subscriptions and size-less (likely sold-out) products. Logs
    new/updated counts and any error coffees; returns nothing.
    Fix: guard ``x.string`` for None in the blend-description loop —
    siblings containing nested tags have ``.string is None``, which made
    ``x.string.strip()`` raise (the newer revision of this scraper
    carries the same guard).
    """
    roaster = 'Victrola'
    victrola = 'http://www.victrolacoffee.com/collections/all-coffee-offerings'
    r = requests.get(victrola)
    soup = BeautifulSoup(r.content, "html.parser")
    coffees_for_sale = soup.find_all('a', {'class': 'product-link'})
    total_coffees = len(coffees_for_sale)
    coffees_entered = 0
    coffees_updated = 0
    error_coffees = []
    for item in coffees_for_sale:
        name, description, notes, region, active, size, product_url = [""] * 7
        price = int()
        url = item['href']
        product_url = 'http://www.victrolacoffee.com' + url
        logging.info("Getting url: {}".format(url))
        r = requests.get(product_url)
        coffee_soup = BeautifulSoup(r.content, "html.parser")
        name = coffee_soup.h2.string
        if 'Subscription' in name:
            total_coffees -= 1
            continue
        if coffee_soup.find('div', {'class': 'select single'}).find('label').text != 'Size':
            total_coffees -= 1
            continue
        try:
            size = coffee_soup.find('select').option.string[:4]
        except AttributeError:
            logging.info('Cannot find size for {}'.format(name))
            continue
        try:
            price = float(coffee_soup.find(itemprop='price').string.strip()[2:])
            active = True
        except AttributeError:
            # its sold out
            active = False
        d = coffee_soup.find('h4', {'class': 'mobile'}).next_siblings
        if 'Blend' in name:
            # different stuff for blends
            notes = []
            region = ''
            for x in d:
                # fix: siblings with nested markup have .string == None
                if x.string:
                    description += x.string.strip()
        else:
            # sometimes tasting notes just alone
            # sometimes they are in 'Flavor'
            # sometimes there are no tasting notes...
            flavor = coffee_soup(text=re.compile('Flavor:'))
            if flavor:
                notes = flavor[1].string.strip()[8:].rstrip(',').lower().split(',')
            else:
                try:
                    notes = coffee_soup.find(text="Tasting Notes").next_element.strip()[2:].rstrip(',').lower().split(',')
                except AttributeError:
                    # can't find any tasting notes
                    notes = []
                    logging.info('No tasting notes for {}'.format(product_url))
        image_url = coffee_soup.find('ul', {'class': 'bx-slider'}).find('img')['src']
        image_content = requests.get("http:{}".format(image_url)).content
        coffee_data = {'name': name, 'roaster': roaster,
                       'description': description, 'price': price,
                       'notes': notes, 'region': region, 'active': active,
                       'product_page': product_url, 'size': size,
                       'image': image_content}
        coffees_updated, coffees_entered, error_coffees = add_or_update_coffee(
            coffee_data, coffees_updated, coffees_entered, error_coffees)
    logging.info('Victrola New Results:{} / {}'.format(coffees_entered, total_coffees))
    logging.info('Victrola Updated Results:{} / {}'.format(coffees_updated, total_coffees))
    if error_coffees:
        logging.warning('Victrola Error coffees are: {}'.format(error_coffees))
def scrape_fortyninth():
    """Scrape 49th Parallel's Shopify coffee collection and upsert each coffee.

    Skips subscription products. Logs new/updated counts and any error
    coffees; returns nothing.
    Fixes: ``if d == []`` -> ``if not d`` idiom and the 'Paralell' typo
    in the error-log message.
    """
    roaster = '49th Parallel'
    coffee_url = 'http://49th-parallel.myshopify.com/collections/coffee'
    base_url = 'https://49th-parallel.myshopify.com'
    r = requests.get(coffee_url)
    soup = BeautifulSoup(r.content, "html.parser")
    coffees_for_sale = soup.find_all('li', {'class': 'product-listing'})
    total_coffees = len(coffees_for_sale)
    coffees_entered = 0
    coffees_updated = 0
    error_coffees = []
    ignored = ['Subscription']
    for item in coffees_for_sale:
        name, description, notes, region, active, size, product_url = [""] * 7
        price = float()
        name = item.h1.string
        if any(word in name for word in ignored):
            total_coffees -= 1
        else:
            url = item.a['href']
            product_url = base_url + url
            logging.info("Getting url: {}".format(product_url))
            r = requests.get(product_url)
            coffee_soup = BeautifulSoup(r.content, "html.parser")
            # logging.info("Title: {}".format(coffee_soup.title))
            details = coffee_soup.find('div', itemprop='description')
            # some pages carry the description in <p class="p1">, others
            # directly under the description div's <p>/<span>
            d = coffee_soup.find_all('p', {'class': 'p1'})
            if not d:
                try:
                    description = details.p.string
                except AttributeError:
                    description = details.span.string
            else:
                description = d[0].string
            notes = details.h3.string.lower()
            notes = notes.split(' // ')
            region = coffee_soup.find(
                'li', {'class': 'product-detail-country'}).string.split()[1]
            size = item.find('data', {'class': 'product-size'}).string.strip()
            price = float(
                item.find('data', {'class': 'product-price'}).string[1:])
            active = True
            image_url = 'https:' + coffee_soup.find(
                'meta', itemprop='image')['content']
            image_content = requests.get(image_url).content
            coffee_data = {
                'name': name,
                'roaster': roaster,
                'description': description,
                'price': price,
                'notes': notes,
                'region': region,
                'active': active,
                'product_page': product_url,
                'size': size,
                'image': image_content
            }
            coffees_updated, coffees_entered, error_coffees = add_or_update_coffee(
                coffee_data, coffees_updated, coffees_entered, error_coffees)
    logging.info('49 Parallel New Results:{} / {}'.format(coffees_entered, total_coffees))
    logging.info('49 Parallel Updated Results:{} / {}'.format(coffees_updated, total_coffees))
    if error_coffees:
        logging.warning('49 Parallel Error coffees are: {}'.format(
            error_coffees))
def scrape_victrola():
    """Scrape Victrola's Shopify coffee listing and upsert each coffee.

    Skips subscriptions and size-less (likely sold-out) products. Logs
    new/updated counts and any error coffees; returns nothing.
    """
    roaster = 'Victrola'
    base_url = 'https://victrola.myshopify.com'
    victrola = 'https://victrola.myshopify.com/collections/all-coffee-offerings'
    r = requests.get(victrola)
    soup = BeautifulSoup(r.content, "html.parser")
    coffees_for_sale = soup.select('a.product-link')
    total_coffees = len(coffees_for_sale)
    coffees_entered = 0
    coffees_updated = 0
    error_coffees = []
    ignored = 'subscription'
    for item in coffees_for_sale:
        url = item['href']
        if ignored in url:
            # subscriptions don't count toward the coffee total
            total_coffees = total_coffees - 1
            continue
        name, description, notes, region, active, size, product_url = [""] * 7
        price = int()
        product_url = base_url + url
        logging.info("Getting url: {}".format(url))
        coffee_soup = BeautifulSoup(
            requests.get(product_url).content, "html.parser")
        if coffee_soup.find(
                'div', {'class': 'select single'}).find('label').text != 'Size':
            # its sold out?
            total_coffees = total_coffees - 1
            continue
        name = coffee_soup.h2.string
        try:
            size = coffee_soup.find('select').option.string.replace(" ", "")[:4]
        except AttributeError:
            logging.info('Cannot find size for {}'.format(name))
            continue
        active = False
        if coffee_soup.find(itemprop='price'):
            # price text like "$ 18.00"; skip the first two characters
            price = float(
                coffee_soup.find(itemprop='price').string.strip()[2:])
            active = True
        description_raw = coffee_soup.select_one('h4.mobile').next_siblings
        if 'Blend' in name:
            # different stuff for blends
            notes = []
            region = ''
            for x in description_raw:
                if x.string:
                    description += x.string.strip()
        else:
            # sometimes tasting notes just alone
            # sometimes they are in 'Flavor'
            # sometimes there are no tasting notes...
            flavor = coffee_soup(text=re.compile('Flavor:'))
            tasting_notes = coffee_soup.find(text="Tasting Notes")
            if flavor:
                notes = flavor[1].string.strip()[8:].rstrip(',').lower().split(
                    ',')
            elif tasting_notes:
                notes = coffee_soup.find(
                    text="Tasting Notes").next_element.strip()[2:].rstrip(
                        ',').lower().split(',')
            else:
                # can't find any tasting notes
                notes = []
                logging.info('No tasting notes for {}'.format(product_url))
        # slider image is too big so we're using the twitter one
        # image_url = coffee_soup.select_one('ul.bx-slider').select_one('img')['src']
        image_url = coffee_soup.find("meta", {"name": "twitter:image"})["content"]
        image_content = requests.get(image_url).content
        coffee_data = {
            'name': name,
            'roaster': roaster,
            'description': description,
            'price': price,
            'notes': notes,
            'region': region,
            'active': active,
            'product_page': product_url,
            'size': size,
            'image': image_content
        }
        coffees_updated, coffees_entered, error_coffees = add_or_update_coffee(
            coffee_data, coffees_updated, coffees_entered, error_coffees)
    logging.info('Victrola New Results:{} / {}'.format(coffees_entered, total_coffees))
    logging.info('Victrola Updated Results:{} / {}'.format(coffees_updated, total_coffees))
    if error_coffees:
        logging.warning('Victrola Error coffees are: {}'.format(error_coffees))
def scrape_heart():
    """Scrape Heart's Shopify beans collection and upsert each coffee.

    Sends an explicit Host header on every request. Skips subscription
    products. Logs new/updated counts and any error coffees; returns
    nothing.
    """
    roaster = 'Heart'
    heart_beans = 'https://heartcoffee.myshopify.com/collections/beans'
    heart_url = 'https://heartcoffee.myshopify.com'
    host = 'heartcoffee.myshopify.com'
    r = requests.get(heart_beans, headers={"Host": host})
    soup = BeautifulSoup(r.content, "html.parser")
    all_coffees_for_sale = soup.find_all('a', {'class': 'grid__image'})
    all_coffee_links = []
    for coffee in all_coffees_for_sale:
        # the image alt text distinguishes coffees from subscriptions
        if not 'Subscription' in coffee.find('img')['alt']:
            all_coffee_links.append("{}{}".format(heart_url, coffee['href']))
    total_coffees = len(all_coffee_links)
    coffees_entered = 0
    coffees_updated = 0
    error_coffees = []
    for url in all_coffee_links:
        name, price, description, notes, region, active, size = [""] * 7
        logging.info("Getting url: {}".format(url))
        r = requests.get(url, headers={"Host": host})
        coffee_soup = BeautifulSoup(r.content, "html.parser")
        blend = False
        active = True
        name = coffee_soup.h1.text.strip()
        if 'blend' in name.lower():
            blend = True
        # first <option> reads like "<size> - <price>" (or contains "Sold Out")
        size_price = coffee_soup.find('option').text
        size = size_price.split(" - ")[0]
        if 'Sold Out' in size_price:
            active = False
            price = 0
        else:
            price = float(
                size_price.split(" - ")[1].replace('USD', '').replace('$', ''))
        description = coffee_soup.find('div', {
            'class': 'tab-content small'
        }).find('div', {
            'id': 'tab1'
        }).text.encode('utf-8').strip()
        notes = coffee_soup.find('p', {
            'class': 'small uppercase flavors'
        }).text.split(',')
        if not blend:
            # blends have no single origin, so region is only set for single origins
            region = country_from_name(name)
        # region = coffee_soup.find('div', {'class':'tab-content small'}).find('div',{'id': 'tab1'}).p.text.replace(u'Location:\xa0', '').replace('Location:', '').encode('utf-8')
        image_url = "http:{}".format(
            coffee_soup.select('div.slide')[0].find('img')['src'])
        image_content = requests.get(image_url).content
        coffee_data = {
            'name': name,
            'roaster': roaster,
            'description': description,
            'price': price,
            'notes': notes,
            'region': region,
            'active': active,
            'product_page': url,
            'size': size,
            'image': image_content
        }
        coffees_updated, coffees_entered, error_coffees = add_or_update_coffee(
            coffee_data, coffees_updated, coffees_entered, error_coffees)
    logging.info('Heart New Results:{} / {}'.format(coffees_entered, total_coffees))
    logging.info('Heart Updated Results:{} / {}'.format(
        coffees_updated, total_coffees))
    if error_coffees:
        logging.warning('Heart Error coffees are: {}'.format(error_coffees))
def scrape_matchstick():
    """Scrape Matchstick's Shopify coffee collection and upsert each coffee.

    Skips subscription products. Logs new/updated counts and any error
    coffees; returns nothing.
    Fixes: the checked size swatch's value was computed but never
    assigned, leaving ``size`` as "" — now assigned; named the
    BeautifulSoup parser for the detail page; corrected the 'Matchstic'
    typo in the updated-results log line.
    """
    roaster = "Matchstick"
    base_url = "https://matchstickcoffee80.myshopify.com"
    r = requests.get(
        "https://matchstickcoffee80.myshopify.com/collections/coffee/")
    soup = BeautifulSoup(r.content, "html.parser")
    coffees_for_sale = soup.select('div.productItem')
    total_coffees = len(coffees_for_sale)
    coffees_entered = 0
    coffees_updated = 0
    error_coffees = []
    ignored = 'subscription'
    for item in coffees_for_sale:
        if ignored in item.text.lower():
            total_coffees = total_coffees - 1
            continue
        name, description, notes, region, active, size, product_url = [""] * 7
        price = float()
        product_url = item.a['href']
        coffee_soup = BeautifulSoup(
            requests.get(base_url + product_url).content, "html.parser")
        name = coffee_soup.h1.text
        location_string = coffee_soup.find(text=re.compile('Location:'))
        region_string = coffee_soup.find(text=re.compile('Region:'))
        # the value after the label is sometimes a tag, sometimes bare text
        if 'text' in dir(location_string.next_element):
            location_str = location_string.next_element.text.strip()
        else:
            location_str = location_string.next_element.strip()
        if 'text' in dir(region_string.next_element):
            region_str = region_string.next_element.text.strip()
        else:
            region_str = region_string.next_element.strip()
        region = u"{} - {}".format(location_str, region_str)
        if coffee_soup.find(text=re.compile('Tasting Notes')):
            notes_string = coffee_soup.find(
                text=re.compile('Tasting Notes')).next_element
            notes = [note.strip() for note in notes_string.text.split(',')]
        else:
            notes = []
        price = float(
            coffee_soup.select_one('span#ProductPrice').text.strip().strip(
                '$'))
        # fix: the selected swatch's value is the bag size — assign it
        size_container = coffee_soup.select_one('div.swatchBox')
        size = size_container.select_one('input[checked]')['value']
        active = True
        product_info = coffee_soup.select_one(
            'div.product-info') or coffee_soup.select_one('span.s1')
        if product_info.find('strong'):
            # drop the bold heading so only the prose remains
            product_info.find('strong').decompose()
        description = product_info.text.strip()
        image_container = coffee_soup.select_one('div#ProductPhoto')
        image_url = 'http:' + image_container.find('img')['src']
        image_content = requests.get(image_url).content
        coffee_data = {
            'name': name,
            'roaster': roaster,
            'description': description,
            'price': price,
            'notes': notes,
            'region': region,
            'active': active,
            'product_page': base_url + product_url,
            'size': size,
            'image': image_content
        }
        coffees_updated, coffees_entered, error_coffees = add_or_update_coffee(
            coffee_data, coffees_updated, coffees_entered, error_coffees)
    logging.info('Matchstick New Results:{} / {}'.format(
        coffees_entered, total_coffees))
    logging.info('Matchstick Updated Results:{} / {}'.format(
        coffees_updated, total_coffees))
    if error_coffees:
        logging.warning(
            'Matchstick Error coffees are: {}'.format(error_coffees))
def scrape_bluebottle():
    """Scrape Blue Bottle's store page (newer markup) and upsert each coffee.

    Skips non-coffee products (boxes, kits, subscriptions, packs). Logs
    new/updated counts and any error coffees; returns nothing.
    Fix: removed a leftover debug ``print(r)``.
    """
    roaster = 'Blue Bottle'
    bluebottle = 'https://bluebottlecoffee.com/store/coffee'
    r = requests.get(bluebottle)
    soup = BeautifulSoup(r.content, "html.parser")
    coffees_for_sale = soup.select('h2.ma0')
    total_coffees = len(coffees_for_sale)
    coffees_entered = 0
    coffees_updated = 0
    error_coffees = []
    # name fragments marking non-coffee products to skip
    ignored = ['Box', 'Kit', 'Subscriptions', 'at Home', 'Pack']
    for item in coffees_for_sale:
        name, description, notes, region, active, size, product_url = [""] * 7
        price = float()
        name = item.string
        if any(word in name for word in ignored):
            total_coffees -= 1
        else:
            url = item.a['href']
            product_url = 'https://bluebottlecoffee.com' + url
            logging.info("Getting url: {}".format(url))
            r = requests.get(product_url)
            coffee_soup = BeautifulSoup(r.content, "html.parser")
            active = True
            price = float(
                coffee_soup.find('span', {
                    'class': 'js-variant-price'
                }).string)
            try:
                description = coffee_soup.find('p', {
                    'class': 'spec-details'
                }).string
            except AttributeError:
                description = coffee_soup.find('p', {
                    'class': 'spec-overview'
                }).string
            notes = coffee_soup.select('div.mb30')[0].string.split(',')
            # only works for not single origin
            region = country_from_name(name)
            try:
                details = coffee_soup.find('p', {
                    'class': 'spec-details'
                }).contents[0].strip()
                if country_from_name(details) != '' and len(details) < 10:
                    region = details
            except AttributeError:
                # if it's an espresso, then it's okay to not have region
                if 'Espresso' in name:
                    region = ""
            try:
                size = coffee_soup.find('div', {'class': 'grid-col-4'}).text
            except AttributeError:
                # no size element at all; skip this product entirely
                continue
            image_url = coffee_soup.img['src']
            image_content = requests.get(image_url).content
            coffee_data = {
                'name': name,
                'roaster': roaster,
                'description': description,
                'price': price,
                'notes': notes,
                'region': region,
                'active': active,
                'product_page': product_url,
                'size': size,
                'image': image_content
            }
            coffees_updated, coffees_entered, error_coffees = add_or_update_coffee(
                coffee_data, coffees_updated, coffees_entered, error_coffees)
    logging.info('Blue Bottle New Results:{} / {}'.format(
        coffees_entered, total_coffees))
    logging.info('Blue Bottle Updated Results:{} / {}'.format(
        coffees_updated, total_coffees))
    if error_coffees:
        logging.warning(
            'Blue Bottle Error coffees are: {}'.format(error_coffees))