# Imports reconstructed for this module; the original file header was not
# included in this excerpt. Helpers such as extract_json, save_locally,
# items_in_sentence, optimize_list, and the module-level tables (sources,
# user_ranges, dish_user_match) are defined elsewhere in the repo.
import datetime
import json
import os
import pickle
import traceback
from random import randint
from urllib.request import urlretrieve

import cv2
import pandas as pd
import requests
from selenium import webdriver


def scrape_doordash_images(doordash_code, foodie_id):
    print("DoordashID:", doordash_code)
    if doordash_code == "":
        return False
    soup = pull_doordash_html(doordash_code)
    # The menu JSON is embedded in the 9th <script> tag as an escaped string.
    clean_soup = soup.find_all('script')[8].get_text().split("\"")[1]
    clean_soup = clean_soup.replace("\\u0022", "\"").replace("\\u002D", "-")
    json_data = json.loads(clean_soup)
    script_dir = os.path.abspath(os.path.join(__file__, "../.."))
    path = script_dir + "/csvfiles/images/" + foodie_id + "-images-doordash/"
    try:
        os.makedirs(path)
    except OSError:
        pass
    exceptions = [
        'a', 'an', 'of', 'the', 'is', 'with', 'or', 'and', 'to', 'from'
    ] + foodie_id.split('-')
    json_menu = extract_json(foodie_id)
    menu_items = save_locally(json_menu, foodie_id)
    # Accumulators for the output spreadsheet.
    foodie_ids = []
    items = []
    filenames = []
    matches = []
    source_ids = []
    n = 0
    for category in json_data['current_menu']['menu_categories']:
        title = category['title']
        cat_items = category['items']
        for item in cat_items:
            image_name = item['name']
            image_url = item['image_url']
            if image_url is None:
                continue
            matched_items = items_in_sentence(image_name, menu_items, 2,
                                              foodie_id, exceptions)
            if len(matched_items) == 0:
                continue
            optimized_items = optimize_list(matched_items, image_name.lower())
            print(optimized_items)
            for optimized_item in optimized_items:
                filename = foodie_id + "-" + str(n) + ".jpg"
                urlretrieve(image_url, path + filename)
                foodie_ids.append(foodie_id)
                items.append(optimized_item)
                filenames.append(filename)
                matches.append(image_name)
                source_ids.append(doordash_code)
                n += 1
    d = {
        'FoodieID': foodie_ids,
        'Item': items,
        'Filename': filenames,
        'Matches': matches,
        'DoordashID': source_ids
    }
    df = pd.DataFrame(d)
    df.to_excel(path + foodie_id + ".xlsx", sheet_name='Sheet1',
                encoding="utf8", index=False)
    return 'Added Doordash Imgs'  # main() stores this return value in Status
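# Every scraper in this file funnels candidate text through items_in_sentence()
# and optimize_list(), which are defined elsewhere in the repo. The sketch
# below is NOT the real implementation; it is a hypothetical reading of the
# contract the call sites assume: return the menu items that share at least
# `threshold` significant words with the sentence, ignoring the stop words and
# FoodieID fragments passed in `exceptions`.
def items_in_sentence_sketch(sentence, menu_items, threshold, foodie_id,
                             exceptions):
    sentence_words = set(sentence.lower().split()) - set(exceptions)
    matched = []
    for item, *_rest in menu_items.get('Food', []):
        item_words = set(item.lower().split()) - set(exceptions)
        overlap = len(sentence_words & item_words)
        # short item names can never reach the threshold, so cap it
        if item_words and overlap >= min(threshold, len(item_words)):
            matched.append(item)
    return matched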
def fill_in_data(row):
    # renamed from `json` to avoid shadowing the json module
    info = extract_json(row['FoodieID'])['Info'][0]
    row['Name'] = info['Name']
    row['Address'] = info['AddressLine']
    row['City'] = info['City']
    row['State'] = info['State']
    row['Zipcode'] = info['ZipCode']
    return row
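# fill_in_data() mutates the row it receives and returns it, so it composes
# directly with pandas' row-wise apply. A minimal usage sketch (assumes the
# frame has a 'FoodieID' column and the database behind extract_json is
# reachable):
#
#     df = df.apply(fill_in_data, axis=1)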
def read_items(script_dir, restaurant_tag):
    filename = script_dir + "/output_menu_items/foodie/" + restaurant_tag + ".txt"
    if os.path.exists(filename):
        # reuse the cached pickle if the menu was already pulled
        with open(filename, 'rb') as f:
            menu_items = pickle.load(f)
    else:
        menu_json = extract_json(restaurant_tag)  # avoid shadowing json
        menu_items = save_locally(menu_json, restaurant_tag)
    return menu_items
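# read_items() expects a pickle written earlier in the pipeline. The writer is
# not part of this file; a hypothetical counterpart is sketched here only to
# document the on-disk format the reader assumes (a plain pickle of the
# menu_items structure):
def write_items_sketch(script_dir, restaurant_tag, menu_items):
    filename = script_dir + "/output_menu_items/foodie/" + restaurant_tag + ".txt"
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, 'wb') as f:
        pickle.dump(menu_items, f)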
def create_output(review, date, foodie_id):
    output = []
    month, day, year = date_breakdown(date)
    # split the review text on newlines
    split_text = review.split('\n')
    split_text = [elem.strip() for elem in split_text]
    # extract JSON from the database and save menu items locally
    menu_json = extract_json(foodie_id)  # avoid shadowing json
    menu_items = save_locally(menu_json, foodie_id)
    yelpid = get_yelp_id(menu_json)  # the JSON also carries the Yelp id
    # iterate through each item for the restaurant
    for item, description, price, category, menu, tag in menu_items['Food']:
        # enumerate avoids the duplicate-line bug of list.index()
        matching = [j for j, s in enumerate(split_text) if match(s, item)]
        if matching != []:  # there is at least one match
            # append the paragraph that follows the matched line
            best_word = best_match(matching, item, split_text)
            output.append([
                "Infatuation", "the-infatuation", yelpid, month, day, year,
                item, split_text[matching[0] + 1], '', category, menu,
                split_text[matching[0]]
            ])
    # set up the DataFrame
    s = pd.DataFrame(output, index=None, columns=[
        'Source', 'SourceUserCode', 'SourceRestaurantCode', 'Month', 'Day',
        'Year', 'Item', 'Sentence', 'Rating', 'Categories', 'Menu',
        'Item Match'
    ])
    # apply the sentiment analyzer
    # load_pretrained_models()
    s['Rating'] = predict(s['Sentence'])
    # drop duplicate sentences
    s = s.drop_duplicates(subset=['Sentence'], keep="first")
    s = convert_to_ten_scale(s)
    # save the Excel file under csvfiles/sentences/comments
    script_dir = os.path.abspath(os.path.join(__file__, "../.."))
    s.to_excel(script_dir + "/csvfiles/sentences/comments/" + foodie_id +
               "-infatuation.xlsx", index=False)
    return len(s)
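# create_output() relies on a match(sentence, item) predicate defined
# elsewhere. A hypothetical minimal version, for illustration only: treat a
# line as matching when it contains the item name case-insensitively.
def match_sketch(sentence, item):
    return item.lower() in sentence.lower()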
def run_postmates_image_scraper(postmates_code, foodie_id):
    # Open the merchant page in a headless incognito browser
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--incognito")
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(options=chrome_options)
    url = "https://postmates.com/merchant/" + postmates_code
    driver.get(url)
    n = 0
    script_dir = os.path.abspath(os.path.join(__file__, "../.."))
    path = script_dir + "/csvfiles/images/" + foodie_id + "-images-postmates/"
    exceptions = [
        'a', 'an', 'of', 'the', 'is', 'with', 'or', 'and', 'to', 'from'
    ] + foodie_id.split('-')
    # menu_items = read_items(script_dir, foodie_id)
    menu_json = extract_json(foodie_id)  # avoid shadowing json
    menu_items = save_locally(menu_json, foodie_id)
    foodie_ids = []
    items = []
    filenames = []
    matches = []
    elements = driver.find_elements_by_xpath(
        "//div[@class='product-container css-2ko7m4 e1tw3vxs3']")
    for element in elements:
        item_name = element.find_element_by_xpath(
            ".//h3[@class='product-name css-1yjxguc e1tw3vxs4']"
        ).get_attribute("innerText")
        matched_items = items_in_sentence(item_name, menu_items, 2, foodie_id,
                                          exceptions)
        if len(matched_items) == 0:
            continue
        imgs = element.find_elements_by_xpath(
            ".//img[@class='css-1hyfx7x e1qfcze94']")
        for img in imgs:
            img_src = img.get_attribute("src")
            print(img_src)
            optimized_items = optimize_list(matched_items, item_name.lower())
            print(optimized_items)
            for item in optimized_items:
                if n == 0:
                    # create the output directory on the first saved image
                    try:
                        os.makedirs(path)
                    except OSError:
                        pass
                filename = foodie_id + "-" + str(n) + ".jpg"
                # strip the webp query suffix so the original jpg is saved;
                # find() returns -1 when absent, so guard before slicing
                webp_finder = img_src.find('format=webp')
                if webp_finder != -1:
                    img_src = img_src[:webp_finder]
                print(img_src)
                save_img_url(img_src, path + filename)
                foodie_ids.append(foodie_id)
                items.append(item)
                filenames.append(filename)
                matches.append(item_name)
                n += 1
                print(n)
    driver.close()
    if n > 0:
        d = {
            'FoodieID': foodie_ids,
            'Item': items,
            'Filename': filenames,
            'Matches': matches
        }
        df = pd.DataFrame(d)
        df.to_excel(path + foodie_id + ".xlsx", sheet_name='Sheet1',
                    encoding="utf8", index=False)
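# save_img_url() is defined elsewhere in the repo; the Postmates scraper above
# only assumes it downloads the image at `url` to `filepath`. A sketch of that
# behavior using requests (an assumption, not the real helper):
def save_img_url_sketch(url, filepath):
    resp = requests.get(url, stream=True, timeout=30)
    resp.raise_for_status()
    with open(filepath, 'wb') as f:
        for chunk in resp.iter_content(chunk_size=8192):
            f.write(chunk)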
def main(f_suffix):
    df = read_input(f_suffix)
    for i, row in df.iterrows():
        if isNaN(row['FoodieID']):
            row['FoodieID'] = create_foodie_id(row)
        try:
            initialize_sources()
            if row['Status'] == "Imported":
                continue
            if "Scrape Info" in row['Status']:
                scrape_info(row, f_suffix, row['Neighborhood'])
                row['Status'] = 'Completed Info Scrape'
            elif "Scrape Menu" in row['Status']:
                source, source_id = scrape_menu(row)
                scrape(row, i, f_suffix)
            if "New Request" in row['Status']:
                row = scrape(row, i, f_suffix)
            if "Menu Available" in row['Status']:
                row = scrape(row, i, f_suffix)
                if "Manually add YelpID" not in row['Status']:
                    row['Status'] = match_analyze_compile(row)
            if "More Reviews" in row['Status']:
                row = scrape(row, i, f_suffix)
                if "Manually add YelpID" not in row['Status']:
                    row['Status'] = match_analyze_compile(row)
            if "Generate PDF" in row['Status']:
                run_generate_analytics_pdf(row, row['FoodieID'])
            if "Ready to Import" in row['Status']:
                try:
                    add_to_database(row['FoodieID'])
                    row['Status'] = 'Imported'
                except Exception:
                    row['Status'] = "Failed to Import"
            if "Caviar Images" in row['Status']:
                print("Running Caviar image scraper")
                if isNaN(row['CaviarID']):
                    caviar_id = gather_id('Caviar', sources['Caviar'], row)
                else:
                    caviar_id = row['CaviarID']
                if caviar_id == '':
                    row['Status'] = 'Could not find CaviarID'
                else:
                    row['Status'] = run_caviar_image_scraper(
                        caviar_id, row['FoodieID'])
                    df.loc[i, 'CaviarID'] = caviar_id
            if "Instagram Images" in row['Status']:
                print("Running Instagram image scraper")
                if isNaN(row['InstagramID']):
                    instagram_id = gather_id('Instagram',
                                             sources['Instagram'], row)
                else:
                    instagram_id = row['InstagramID']
                if instagram_id == '':
                    row['Status'] = 'Could not find InstagramID'
                else:
                    df.loc[i, 'InstagramID'], n = run_ig_img_scraper(
                        instagram_id, row['FoodieID'])
                    if n <= 1:
                        row['Status'] = "Zero IG Imgs scraped"
                    else:
                        row['Status'] = 'Added IG Imgs'
            if "Doordash Images" in row['Status']:
                print("Running Doordash image scraper")
                if isNaN(row['DoordashID']):
                    doordash_id = gather_id('DoorDash', sources['DoorDash'],
                                            row)
                else:
                    doordash_id = row['DoordashID']
                if doordash_id == '':
                    row['Status'] = 'Could not find DoordashID'
                else:
                    df.loc[i, 'DoordashID'] = doordash_id
                    row['Status'] = scrape_doordash_images(
                        doordash_id, row['FoodieID'])
            if "Yelp Images" in row['Status']:
                print("Running Yelp image scraper")
                if isNaN(row['YelpID']):
                    yelp_id = gather_id('Yelp', sources['Yelp'], row)
                else:
                    yelp_id = row['YelpID']
                if yelp_id == '':
                    row['Status'] = 'Could not find YelpID'
                else:
                    df.loc[i, 'YelpID'] = yelp_id
                    output = pull_yelp_images(yelp_id, 100)
                    n = analyze_yelp_images(output, row['FoodieID'], yelp_id)
                    row['Status'] = "Scraped Yelp Images" + str(n)
            # print("Running Postmates image scraper")
            # if isNaN(row['PostMatesID']):
            #     postmates_id = gather_id('PostMates', sources['PostMates'], row)
            #     print("New PostMatesID: ", postmates_id)
            # else:
            #     postmates_id = row['PostMatesID']
            #     print("Existing PostMates ID: ", postmates_id)
            # run_postmates_image_scraper(postmates_id, row['FoodieID'])
            # df.loc[i, 'PostMatesID'] = postmates_id
            # print("Running Doordash image scraper")
            # if isNaN(row['DoordashID']):
            #     doordash_id = gather_id('Doordash', sources['Doordash'], row)
            # else:
            #     doordash_id = row['DoordashID']
            # scrape_doordash_images(doordash_id, row['FoodieID'])
            # df.loc[i, 'DoordashID'] = doordash_id
            if "Instagram Info" in row['Status']:
                run_ig_info_scraper(row['Hashtag'], 100)
                row['Status'] = row['Status'].replace('Instagram Info',
                                                      '+IG Info')
            if "Pull Menu" in row['Status']:
                menu_json = extract_json(row['FoodieID'])
                save_locally(menu_json, row['FoodieID'])
                run_save_items_csv(row['FoodieID'])
                row['Status'] = "Pulled Menu from Database"
            if "Enter Info" in row['Status']:
                row = fill_in_data(row)
                df.loc[i] = row
                row['Status'] = "Entered Info from Database"
            if "User Comments General" in row['Status']:
                menu_json = extract_json(row['FoodieID'])
                save_locally(menu_json, row['FoodieID'])
                scrape_reviews(row, 150)
                run_match_sentences(row['FoodieID'], 4, 1)
                run_sentiment_analyzer(0, row['FoodieID'], '')
                row['Status'] = create_comments(row, 'General')
            if "User Comments DC" in row['Status']:
                menu_json = extract_json(row['FoodieID'])
                save_locally(menu_json, row['FoodieID'])
                scrape_reviews(row, 150)
                run_match_sentences(row['FoodieID'], 4, 1)
                run_sentiment_analyzer(0, row['FoodieID'], '')
                row['Status'] = create_comments(row, 'DC')
            if "FoursquareID" in row['Status']:
                row['FoursquareID'] = get_id(row['Name'], row['Address'],
                                             row['City'], row['State'],
                                             row['Zipcode'])
                print(row['FoursquareID'])
                df.loc[i, 'FoursquareID'] = row['FoursquareID']
            if "GooglePlaceID" in row['Status']:
                row['GooglePlaceID'], row['Status'] = generate_google_api_key(
                    row['Name'], row['Address'], row['City'], row['State'],
                    row['Zipcode'])
                df.loc[i, 'GooglePlaceID'] = row['GooglePlaceID']
            if "Delivery Link" in row['Status']:
                get_delivery_links(row)
                import_delivery_links(f_suffix)
                row['Status'] = 'Added Del. Link'
            if "Automated DMs" in row['Status']:
                if isNaN(row['InstagramID']):
                    instagram_id = gather_id('Instagram',
                                             sources['Instagram'], row)
                else:
                    instagram_id = row['InstagramID']
                usernames_for_automated_dms(instagram_id, 300, row['Name'],
                                            row['City'])
                row['Status'] = 'Gathered Usernames'
            if "Doordash Menu" in row['Status']:
                if isNaN(row['DoordashID']):
                    doordash_id = gather_id('DoorDash', sources['DoorDash'],
                                            row)
                    row['DoordashID'] = doordash_id
                    df.loc[i, 'DoordashID'] = doordash_id
                else:
                    doordash_id = row['DoordashID']
                if doordash_id == "":
                    row['Status'] = "Could not find DoordashID"
                else:
                    scrape_doordash_menu(doordash_id, row['FoodieID'])
                    run_save_items_csv(row['FoodieID'])
                    row['Status'] = 'Menu Scraped from Doordash'
            if "Grubhub Menu" in row['Status']:
                if isNaN(row['GrubhubID']):
                    grubhub_id = gather_id('GrubHub', sources['GrubHub'], row)
                    df.loc[i, 'GrubhubID'] = grubhub_id
                else:
                    grubhub_id = row['GrubhubID']
                if grubhub_id == "":
                    row['Status'] = "Could not find GrubhubID"
                else:
                    scrape_grubhub_menu(
                        'https://api-gtm.grubhub.com/restaurants/' + grubhub_id,
                        'Bearer d738ccb5-56fa-4618-849a-0862aac27a60',
                        row['FoodieID'])
                    run_save_items_csv(row['FoodieID'])
                    row['Status'] = 'Menu Scraped from GrubHub'
            if "Caviar Menu" in row['Status']:
                if isNaN(row['CaviarID']):
                    caviar_id = gather_id('Caviar', sources['Caviar'], row)
                    df.loc[i, 'CaviarID'] = caviar_id
                else:
                    caviar_id = row['CaviarID']
                if caviar_id == "":
                    row['Status'] = "Could not find CaviarID"
                else:
                    scrape_caviar_menu(caviar_id, row['FoodieID'])
                    run_save_items_csv(row['FoodieID'])
                    row['Status'] = 'Menu Scraped from Caviar'
            if "Google Description" in row['Status']:
                query = row['Name'] + row['Address'] + row['City']
                google_description = scrape_google_description(query)
                print(google_description)
                df.loc[i, 'Google Description'] = google_description
                row['Status'] = 'Scraped Description'
            if "Google Hours Spent" in row['Status']:
                query = row['Name'] + row['Address'] + row['City']
                hours_spent = scrape_google_hours_spent(query)
                df.loc[i, 'Google Hours Spent'] = hours_spent
                row['Status'] = 'Scraped Hours Spent'
            if "Categories" in row['Status']:
                name, address, city, state, zipcode, yelpId = read_row(row)
                if isNaN(row['YelpID']):
                    row['YelpID'] = gather_id('Yelp', sources['Yelp'], row)
                categories = gather_categories(name, address, city, state,
                                               zipcode, row['YelpID'])
                df.loc[i, "Categories"] = categories
                row['Status'] = 'Scraped Category'
            if "Yelp Neighborhood" in row['Status']:
                if isNaN(row['YelpID']):
                    print("Gathering YelpID")
                    row['YelpID'] = gather_id('Yelp', sources['Yelp'], row)
                    print(row['YelpID'])
                info = scrap_data(row['YelpID'], "")
                df.loc[i, 'Yelp Neighborhood'] = info['Neighborhood']
                row['Status'] = 'Scraped Neighborhood'
            if "Woflow Menu" in row['Status']:
                run_woflow_conversion(row['FoodieID'], row['GooglePlaceID'],
                                      row['Name'])
                run_save_items_csv(row['FoodieID'])
                run_save_items_pickle(row['FoodieID'], row['FoodieID'])
                row['Status'] = 'Converted Woflow'
            if "Infatuation Reviews" in row['Status']:
                review, date = scrape_infatuation_reviews(row['InfatuationID'])
                n = create_output(review, date, row['FoodieID'])
                if n == 0:
                    row['Status'] = "Zero Infatuation Rvs Scraped"
                else:
                    row['Status'] = "Added Infatuation Rvs" + str(n)
        except Exception:
            traceback.print_exc()
            row['Status'] = "Script Error"
        df.loc[i, 'Status'] = row['Status']
        df.loc[i, 'FoodieID'] = row['FoodieID']
        df.loc[i, 'YelpID'] = row['YelpID']
    save_output(df, f_suffix)
    try:
        zip_upwork_verify_menus(f_suffix)
    except Exception:
        pass
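# main() depends on an isNaN() helper defined elsewhere; the conventional
# pandas-friendly one-liner is shown here as an assumption. The commented
# entry point is likewise hypothetical: f_suffix selects which input
# spreadsheet read_input()/save_output() operate on.
def isNaN_sketch(value):
    return value != value  # NaN is the only float not equal to itself

# import sys
# if __name__ == "__main__":
#     main(sys.argv[1])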
def analyze_yelp_images(images_data, foodie_id, yelp_id):
    exceptions = [
        'a', 'an', 'of', 'the', 'is', 'with', 'or', 'and', 'to', 'from'
    ] + foodie_id.split('-')
    script_dir = os.path.abspath(os.path.join(__file__, "../.."))
    # menu_items = read_items(script_dir, foodie_id)
    menu_json = extract_json(foodie_id)  # avoid shadowing json
    menu_items = save_locally(menu_json, foodie_id)
    path = script_dir + "/csvfiles/images/" + foodie_id[:100] + \
        yelp_id.replace('/', '-') + "-images-yelp/"
    try:
        os.makedirs(path)
    except OSError:
        pass
    # initializing
    foodie_ids = []
    items = []
    filenames = []
    captions = []
    source_ids = []
    # start numbering at 100 so Yelp filenames never collide with the other
    # image sources; the return value subtracts the offset back out
    n = 100
    for image_data in images_data:
        caption = image_data[0]
        link = image_data[1]
        # skip the caption if it does not contain a menu item
        items_in_given_sentence = items_in_sentence(caption, menu_items, 2,
                                                    foodie_id, exceptions)
        print(items_in_given_sentence)
        if len(items_in_given_sentence) == 0:
            continue
        # choose the best item out of all matched items
        optimized_items = optimize_list(items_in_given_sentence,
                                        caption.lower())
        for item in optimized_items:
            filename = foodie_id + "-" + str(n) + ".jpg"
            urlretrieve(link, path + filename)
            foodie_ids.append(foodie_id)
            items.append(item)
            filenames.append(filename)
            captions.append(caption)
            source_ids.append(yelp_id)
            n += 1
            print(n)
    d = {
        'FoodieID': foodie_ids,
        'Item': items,
        'Filename': filenames,
        'Captions': captions,
        'YelpID': source_ids
    }
    df = pd.DataFrame(d)
    df.to_excel(path + foodie_id[:100] + yelp_id.replace('/', '-') + ".xlsx",
                sheet_name='Sheet1', encoding="utf8", index=False)
    return n - 100
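# optimize_list() narrows the candidates returned by items_in_sentence() to
# the best match(es) for a caption. Its real implementation lives elsewhere;
# this is a hypothetical sketch of the behavior the call sites suggest (keep
# the item with the largest word overlap against the lower-cased caption):
def optimize_list_sketch(matched_items, caption_lower):
    caption_words = set(caption_lower.split())

    def overlap(item):
        return len(set(item.lower().split()) & caption_words)

    return [max(matched_items, key=overlap)]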
def run_chowbusimagescraper(chowbus_id, foodie_id):
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--incognito")
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(options=chrome_options)
    url = "https://www.chowbus.com" + chowbus_id
    driver.get(url)
    # Logistics
    script_dir = os.path.abspath(os.path.join(__file__, "../.."))
    path = script_dir + "/csvfiles/images/" + foodie_id + "-images-chowbus/"
    exceptions = [
        'a', 'an', 'of', 'the', 'is', 'with', 'or', 'and', 'to', 'from'
    ] + foodie_id.split('-')
    print(path)
    # menu_items = read_items(script_dir, foodie_id)
    menu_json = extract_json(foodie_id)  # avoid shadowing json
    if menu_json['Items'] == []:
        return 'Could not pull images from database. Potential FoodieID mismatch.'
    menu_items = save_locally(menu_json, foodie_id)
    foodie_ids = []
    source_ids = []
    items = []
    filenames = []
    matches = []
    n = 0
    # NOTE: the jss* class names are build-generated and brittle
    elements = driver.find_elements_by_class_name('jss290')
    print(elements)
    print("Elements length", len(elements))
    for element in elements:
        item_name = element.get_attribute("innerText")
        matched_items = items_in_sentence(item_name, menu_items, 2, foodie_id,
                                          exceptions)
        if len(matched_items) == 0:
            continue
        imgs = element.find_elements_by_class_name("jss326")
        for img in imgs:
            img_src = img.get_attribute("src")
            print(img_src)
            optimized_items = optimize_list(matched_items, item_name.lower())
            print("the length of the list is:", len(optimized_items))
            for item in optimized_items:
                if n == 0:
                    try:
                        os.makedirs(path)
                    except OSError:
                        pass
                filename = foodie_id + "-" + str(n) + ".jpg"
                urlretrieve(img_src, path + filename)
                print(filename)
                foodie_ids.append(foodie_id)
                items.append(item)
                filenames.append(filename)
                matches.append(item_name)
                n += 1
                print(n)
    driver.close()
    if n > 0:
        d = {
            'FoodieID': foodie_ids,
            'Item': items,
            'Filename': filenames,
            'Matches': matches
        }
        df = pd.DataFrame(d)
        df.to_excel(path + foodie_id + ".xlsx", sheet_name='Sheet1',
                    encoding="utf8", index=False)
        return 'Added Chowbus Imgs'
    return 'No Chowbus Imgs Scraped'
def create_comments(row, user_type):
    foodie_id = row['FoodieID']
    yelp_id = row['YelpID']
    # set file paths to data inputs
    script_dir = os.path.abspath(os.path.join(__file__, "../.."))
    file_path = (script_dir + "/csvfiles/sentences/labeled-rated/" +
                 foodie_id + "-labeled-rated.xlsx")
    output_path = (script_dir + "/csvfiles/sentences/comments/" + foodie_id +
                   "-comments.xlsx")
    menu_json = extract_json(foodie_id)  # avoid shadowing json
    df = pd.read_excel(file_path)
    info = menu_json['Info']
    df = df.drop(columns=['Keywords'])
    df = max_3(df)
    # df = df.drop_duplicates(subset=['Item'], keep="first")
    df['Source'] = 'Yelp'
    # if yelp_id == '':
    #     df['SourceRestaurantCode'] = get_yelp_id(menu_json)
    # else:
    #     df['SourceRestaurantCode'] = yelp_id
    df['FoodieID'] = foodie_id
    df['Month'] = ""
    df['Day'] = ""
    df['Year'] = ""
    start = user_ranges[user_type]['start']
    end = user_ranges[user_type]['end']
    user_code = randint(start, end) + 1000
    random_day = random_date()
    # `comment` avoids shadowing the `row` parameter above
    for i, comment in df.iterrows():
        print(comment)
        # rotate to a new synthetic user (and date) every five comments
        if i % 5 == 0:
            user_code = randint(start, end) + 1000
            if user_code == 1104:  # user code 1104 is excluded
                user_code = 1105
            random_day = random_date()
        # never let the same synthetic user review the same dish twice
        if comment['Item'] in dish_user_match:
            while user_code in dish_user_match[comment['Item']]:
                user_code = randint(start, end) + 1000
            dish_user_match[comment['Item']].append(user_code)
        else:
            dish_user_match[comment['Item']] = []
            dish_user_match[comment['Item']].append(user_code)
        df.loc[i, 'SourceUserCode'] = "user-" + str(user_code)
        df.loc[i, 'Month'] = random_day.month
        df.loc[i, 'Day'] = random_day.day
        df.loc[i, 'Year'] = random_day.year
        df.loc[i, 'Categories'], df.loc[i, 'Menu'], df.loc[i, 'Description'] = \
            match_item(comment['Item'], menu_json)
    df = df[['Source', 'SourceUserCode', 'FoodieID', 'Month', 'Day', 'Year',
             'Item', 'Sentence', 'Rating', 'Categories', 'Menu',
             'Description']]
    df.to_excel(output_path, index=False)
    return "Comments Extracted"
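# random_date() stamps each synthetic comment; it is defined elsewhere. A
# minimal sketch under the assumption that any date in the recent past will
# do:
def random_date_sketch(days_back=730):
    return datetime.date.today() - datetime.timedelta(
        days=randint(0, days_back))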
def run_caviar_image_scraper(caviar_id, foodie_id):
    # Open browser headless and in incognito
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--incognito")
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(options=chrome_options)
    url = "https://www.trycaviar.com/" + caviar_id
    driver.get(url)
    # Logistics
    script_dir = os.path.abspath(os.path.join(__file__, "../.."))
    path = script_dir + "/csvfiles/images/" + foodie_id + "-images-caviar/"
    exceptions = [
        'a', 'an', 'of', 'the', 'is', 'with', 'or', 'and', 'to', 'from'
    ] + foodie_id.split('-')
    # menu_items = read_items(script_dir, foodie_id)
    menu_json = extract_json(foodie_id)  # avoid shadowing json
    if menu_json['Items'] == []:
        return 'Could not pull images from database. Potential FoodieID mismatch.'
    menu_items = save_locally(menu_json, foodie_id)
    foodie_ids = []
    source_ids = []
    items = []
    filenames = []
    matches = []
    n = 0
    # collect links for every dish tile, available or not
    dishes = driver.find_elements_by_xpath(
        "//a[@class='js-offer-link offer-tile_link']")
    dishes = dishes + driver.find_elements_by_xpath(
        "//a[@class='js-offer-link offer-tile_link offer-tile_link--unavailable']"
    )
    dish_links = []
    for dish in dishes:
        dish_link = dish.get_attribute("href")
        dish_links.append(dish_link)
    for dish_link in dish_links:
        driver.get(dish_link)
        item_name = driver.find_element_by_xpath(
            "//h1[@class='item_name']").text
        item_img_srcset = driver.find_elements_by_xpath(
            "//img[@class='item_image']")
        if item_img_srcset == []:
            continue
        print(item_name)
        print(item_img_srcset)
        # srcset alternates URL and size descriptor tokens, so the
        # second-to-last token is the last (largest) image URL
        item_img_srcset = item_img_srcset[0].get_attribute("srcset").split()
        img_src = item_img_srcset[len(item_img_srcset) - 2]
        matched_items = items_in_sentence(item_name, menu_items, 2, foodie_id,
                                          exceptions)
        if len(matched_items) == 0:
            continue
        optimized_items = optimize_list(matched_items, item_name.lower())
        for item in optimized_items:
            if n == 0:
                try:
                    os.makedirs(path)
                except OSError:
                    pass
            filename = foodie_id + "-" + str(n) + ".jpg"
            urlretrieve(img_src, path + filename)
            foodie_ids.append(foodie_id)
            items.append(item)
            filenames.append(filename)
            matches.append(item_name)
            n += 1
            print(n)
    driver.close()
    if n > 0:
        d = {
            'FoodieID': foodie_ids,
            'Item': items,
            'Filename': filenames,
            'Matches': matches
        }
        df = pd.DataFrame(d)
        df.to_excel(path + foodie_id + ".xlsx", sheet_name='Sheet1',
                    encoding="utf8", index=False)
        return 'Added Caviar Imgs'
    else:
        return 'No Caviar Imgs Scraped'
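# The four ChromeOptions flags above are repeated verbatim in every Selenium
# scraper in this file (Postmates, Chowbus, Caviar). A small factory like the
# sketch below would centralize them; shown as a suggestion only, the
# scrapers above still construct their own drivers:
def make_headless_driver():
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--incognito")
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-gpu')
    return webdriver.Chrome(options=chrome_options)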
def scrape_images(storeId, foodie_id):
    name = "doordash_menu"
    url = "https://api-consumer-client.doordash.com/graphql"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
        'Content-Type': 'application/json',
        'Credentials': 'include'
    }
    body = {
        "operationName": "menu",
        "variables": {
            # "storeId": "360",
            # "storeId": "2693"
            "storeId": storeId
            # "menuId": "223199"
        },
        "query": '''
query menu($storeId: ID!, $menuId: ID) {
  storeInformation(storeId: $storeId) {
    id name description isGoodForGroupOrders offersPickup offersDelivery
    deliveryFee sosDeliveryFee numRatings averageRating shouldShowStoreLogo
    isConsumerSubscriptionEligible headerImgUrl coverImgUrl
    distanceFromConsumer providesExternalCourierTracking
    fulfillsOwnDeliveries isDeliverableToConsumerAddress priceRange
    business { id name __typename }
    address { street printableAddress lat lng city state __typename }
    status {
      asapAvailable scheduledAvailable asapMinutesRange
      asapPickupMinutesRange __typename
    }
    merchantPromotions {
      id minimumOrderCartSubtotal newStoreCustomersOnly deliveryFee __typename
    }
    storeDisclaimers {
      id disclaimerDetailsLink disclaimerLinkSubstring disclaimerText
      displayTreatment __typename
    }
    __typename
  }
  storeMenus(storeId: $storeId, menuId: $menuId) {
    allMenus { id name subtitle isBusinessEnabled timesOpen __typename }
    currentMenu {
      id timesOpen hoursToOrderInAdvance isCatering minOrderSize
      menuCategories {
        ...StoreMenuCategoryFragment
        items { ...StoreMenuListItemFragment __typename }
        __typename
      }
      __typename
    }
    __typename
  }
  storeCrossLinks(storeId: $storeId) {
    trendingStores { ...StoreCrossLinkItemFragment __typename }
    trendingCategories { ...StoreCrossLinkItemFragment __typename }
    topCuisinesNearMe { ...StoreCrossLinkItemFragment __typename }
    nearbyCities { ...StoreCrossLinkItemFragment __typename }
    __typename
  }
}
fragment StoreMenuCategoryFragment on StoreMenuCategory {
  id subtitle title __typename
}
fragment StoreMenuListItemFragment on StoreMenuListItem {
  id description isTempDeactivated price imageUrl name __typename
}
fragment StoreCrossLinkItemFragment on StoreCrossLinkItem {
  name url __typename
}
'''
    }
    response = requests.post(
        url,
        cookies={'X-CSRFToken': 'MKp9Os0ao3HiPO9ybnSFdDy7HrrodcxiFOWVhuhjaHEybo28kCAfBwMOWp6b78BU'},
        data=json.dumps(body),
        headers=headers)
    print(response)
    allMenus = response.json()['data']['storeMenus']['allMenus']
    # Logistics
    script_dir = os.path.abspath(os.path.join(__file__, "../.."))
    path = script_dir + "/csvfiles/images/" + foodie_id + "-images-doordash/"
    exceptions = [
        'a', 'an', 'of', 'the', 'is', 'with', 'or', 'and', 'to', 'from'
    ] + foodie_id.split('-')
    # menu_items = read_items(script_dir, foodie_id)
    json_t = extract_json(foodie_id)
    if json_t['Items'] == []:
        return 'Could not pull images from database. Potential FoodieID mismatch.'
    menu_items = save_locally(json_t, foodie_id)
    foodie_ids = []
    source_ids = []
    items = []
    filenames = []
    matches = []
    n = 0
    for menu in allMenus:
        menu_name = menu['subtitle']
        # re-query the endpoint once per menu id
        body['variables']['menuId'] = menu['id']
        response = requests.post(
            url,
            cookies={'X-CSRFToken': 'MKp9Os0ao3HiPO9ybnSFdDy7HrrodcxiFOWVhuhjaHEybo28kCAfBwMOWp6b78BU'},
            data=json.dumps(body),
            headers=headers)
        menu_categories = response.json()['data']['storeMenus']['currentMenu']['menuCategories']
        for category in menu_categories:
            for item in category['items']:
                if item['imageUrl']:
                    item_name = item['name']
                    img_url = item['imageUrl']
                    matched_items = items_in_sentence(item_name, menu_items,
                                                      2, foodie_id,
                                                      exceptions)
                    if len(matched_items) == 0:
                        continue
                    optimized_items = optimize_list(matched_items,
                                                    item_name.lower())
                    for optimized_item in optimized_items:
                        if n == 0:
                            try:
                                os.makedirs(path)
                            except OSError:
                                pass
                        filename = foodie_id + "-" + str(n) + ".jpg"
                        urlretrieve(img_url, path + filename)
                        # read the saved image back to check its dimensions
                        img = cv2.imread(path + filename, cv2.IMREAD_UNCHANGED)
                        height = img.shape[0]
                        width = img.shape[1]
                        # keep only images large enough to be usable
                        if height > 300 and width > 450:
                            print(height, width)
                            foodie_ids.append(foodie_id)
                            items.append(optimized_item)
                            filenames.append(filename)
                            matches.append(item_name)
                            n += 1
                            print(n)
    if n > 0:
        d = {
            'FoodieID': foodie_ids,
            'Item': items,
            'Filename': filenames,
            'Matches': matches
        }
        df = pd.DataFrame(d)
        df.to_excel(path + foodie_id + ".xlsx", sheet_name='Sheet1',
                    encoding="utf8", index=False)
        return 'Added Doordash Imgs'
    else:
        return 'Zero Doordash Imgs Scraped'
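# Typical invocation of the GraphQL-based DoorDash scraper. storeId is the
# numeric DoorDash store id (the commented examples in the request body show
# the expected shape); the ids below are placeholders:
#
#     status = scrape_images("2693", "example-restaurant-dc")
#     print(status)  # 'Added Doordash Imgs' or 'Zero Doordash Imgs Scraped'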
def pull_images(response_json, foodie_id):
    # Logistics
    script_dir = os.path.abspath(os.path.join(__file__, "../.."))
    path = script_dir + "/csvfiles/images/" + foodie_id + "-images-grubhub/"
    exceptions = [
        'a', 'an', 'of', 'the', 'is', 'with', 'or', 'and', 'to', 'from'
    ] + foodie_id.split('-')
    # menu_items = read_items(script_dir, foodie_id)
    json_t = extract_json(foodie_id)
    if json_t['Items'] == []:
        return 'Could not pull images from database. Potential FoodieID mismatch.'
    menu_items = save_locally(json_t, foodie_id)
    foodie_ids = []
    source_ids = []
    items = []
    filenames = []
    matches = []
    n = 0
    print(response_json)
    categories = response_json['restaurant']['menu_category_list']
    for category in categories:
        dishes = category['menu_item_list']
        for dish in dishes:
            item_name = dish['name']
            if 'media_image' in dish:
                # GrubHub image URLs are assembled from base_url, public_id,
                # and format
                img_url = (dish['media_image']['base_url'] +
                           dish['media_image']['public_id'] + '.' +
                           dish['media_image']['format'])
                print(img_url)
                matched_items = items_in_sentence(item_name, menu_items, 2,
                                                  foodie_id, exceptions)
                if len(matched_items) == 0:
                    continue
                optimized_items = optimize_list(matched_items,
                                                item_name.lower())
                for item in optimized_items:
                    if n == 0:
                        try:
                            os.makedirs(path)
                        except OSError:
                            pass
                    filename = foodie_id + "-" + str(n) + ".jpg"
                    urlretrieve(img_url, path + filename)
                    foodie_ids.append(foodie_id)
                    items.append(item)
                    filenames.append(filename)
                    matches.append(item_name)
                    n += 1
                    print(n)
    if n > 0:
        d = {
            'FoodieID': foodie_ids,
            'Item': items,
            'Filename': filenames,
            'Matches': matches
        }
        df = pd.DataFrame(d)
        df.to_excel(path + foodie_id + ".xlsx", sheet_name='Sheet1',
                    encoding="utf8", index=False)
        return 'Added GrubHub Imgs'
    return 'Zero GrubHub Imgs Scraped'
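# pull_images() consumes the parsed JSON of GrubHub's restaurant endpoint;
# main() reaches the same endpoint through scrape_grubhub_menu(). A hedged
# wiring sketch (the bearer token is a placeholder, and the exact headers the
# API requires may differ):
#
#     resp = requests.get(
#         'https://api-gtm.grubhub.com/restaurants/' + grubhub_id,
#         headers={'Authorization': 'Bearer <token>'})
#     status = pull_images(resp.json(), foodie_id)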