Example #1
def scrape_doordash_images(doordash_code, foodie_id):
    print("DoordashID:", doordash_code)
    if (doordash_code == ""):
        return False

    soup = pull_doordash_html(doordash_code)

    clean_soup = soup.find_all('script')[8].get_text().split("\"")[1]
    clean_soup = clean_soup.replace("\\u0022", "\"").replace("\\u002D", "-")
    json_data = json.loads(clean_soup)
    script_dir = os.path.abspath(os.path.join(__file__, "../.."))
    path = script_dir + "/csvfiles/images/" + foodie_id + "-images-doordash/"
    try:
        os.makedirs(path)
    except OSError:
        pass
    exceptions = [
        'a', 'an', 'of', 'the', 'is', 'with', 'or', 'and', 'to', 'from'
    ] + foodie_id.split('-')
    json_menu = extract_json(foodie_id)
    menu_items = save_locally(json_menu, foodie_id)

    # accumulators for the output spreadsheet
    foodie_ids = []
    items = []
    filenames = []
    matches = []
    source_ids = []
    n = 0

    for category in json_data['current_menu']['menu_categories']:
        title = category['title']
        cat_items = category['items']
        for item in cat_items:
            image_name = item['name']
            image_url = item['image_url']
            if image_url is None:
                continue

            matched_items = items_in_sentence(image_name, menu_items, 2,
                                              foodie_id, exceptions)
            if (len(matched_items) == 0):
                continue

            optimized_items = optimize_list(matched_items, image_name.lower())
            print(optimized_items)
            for optimized_item in optimized_items:
                filename = foodie_id + "-" + str(n) + ".jpg"
                urlretrieve(image_url, path + filename)
                foodie_ids.append(foodie_id)
                items.append(optimized_item)
                filenames.append(filename)
                matches.append(image_name)
                source_ids.append(doordash_code)
                n += 1

    d = {
        'FoodieID': foodie_ids,
        'Item': items,
        'Filename': filenames,
        'Matches': matches,
        'DoordashID': source_ids
    }
    df = pd.DataFrame(d)
    df.to_excel(path + foodie_id + ".xlsx",
                sheet_name='Sheet1',
                encoding="utf8",
                index=False)
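The hard-coded index in `soup.find_all('script')[8]` silently breaks whenever DoorDash reorders its page scripts. A minimal sketch of a more defensive lookup (hypothetical helper, assuming the target script still embeds the `current_menu` key):

def find_menu_script(soup):
    # scan every script tag instead of trusting a fixed position;
    # return the text of the first one carrying the serialized menu
    for script in soup.find_all('script'):
        text = script.get_text()
        if 'current_menu' in text:
            return text
    return None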
Example #2
def fill_in_data(row):
    info = extract_json(row['FoodieID'])['Info'][0]
    row['Name'] = info['Name']
    row['Address'] = info['AddressLine']
    row['City'] = info['City']
    row['State'] = info['State']
    row['Zipcode'] = info['ZipCode']
    return row
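Because `fill_in_data` takes a row and returns that same row, it drops straight into a pandas row-wise apply; a sketch, assuming `df` has a populated `FoodieID` column:

df = df.apply(fill_in_data, axis=1)  # fills Name, Address, City, State, Zipcode per row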
Example #3
def read_items(script_dir, restaurant_tag):
    filename = script_dir + "/output_menu_items/foodie/" + restaurant_tag + ".txt"
    if os.path.exists(filename):
        with open(filename, 'rb') as f:
            menu_items = pickle.load(f)
    else:
        json = extract_json(restaurant_tag)
        menu_items = save_locally(json, restaurant_tag)
    return menu_items
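`read_items` reads a pickle cache that some earlier step must have written; the repo's `save_locally` presumably handles that write. A hypothetical sketch of the write side under the same path convention:

import pickle

def write_items_cache(script_dir, restaurant_tag, menu_items):
    # hypothetical counterpart to read_items; mirrors its path layout
    filename = script_dir + "/output_menu_items/foodie/" + restaurant_tag + ".txt"
    with open(filename, 'wb') as f:
        pickle.dump(menu_items, f)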
def create_output(review, date, foodie_id):
    output = []
    month, day, year = date_breakdown(date)

    # split the review on newlines and strip each line
    split_text = review.split('\n')
    split_text = [elem.strip() for elem in split_text]

    # extract the restaurant JSON from the database and save its menu items locally
    json = extract_json(foodie_id)
    menu_items = save_locally(json, foodie_id)
    yelpid = get_yelp_id(json)  # the Yelp ID can be extracted from the JSON

    for item, description, price, category, menu, tag in menu_items['Food']:  # iterate over every menu item for the restaurant
        # record the index of every line that mentions this item
        matching = [idx for idx, s in enumerate(split_text) if match(s, item)]

        if matching:  # the item was mentioned somewhere in the review
            # append the paragraph that follows the first matching line
            best_word = best_match(matching, item, split_text)  # currently unused
            output.append(["Infatuation", "the-infatuation", yelpid, month, day, year, item, split_text[matching[0] + 1], '', category, menu, split_text[matching[0]]])

    # setting up DataFrame
    s = pd.DataFrame(output, index=None, columns=['Source', 'SourceUserCode', 'SourceRestaurantCode', 'Month', 'Day', 'Year', 'Item', 'Sentence', 'Rating', 'Categories', 'Menu', 'Item Match']) 

    # applying the sentiment analyzer
    # load_pretrained_models()
    s['Rating'] = predict(s['Sentence'])

    # dropping duplicates
    s = s.drop_duplicates(subset=['Sentence'], keep="first")

    s = convert_to_ten_scale(s)

    # saving Excel in csvfiles/comments
    script_dir = os.path.abspath(os.path.join(__file__ ,"../.."))
    s.to_excel(script_dir + "/csvfiles/sentences/comments/" + foodie_id + "-infatuation.xlsx", index=False)

    return len(s)
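`create_output` relies on a `match(sentence, item)` helper that is not shown in this section. A minimal hypothetical stand-in, assuming a case-insensitive containment test (the real helper may be fuzzier):

def match(sentence, item):
    # hypothetical: a line matches when it mentions the menu item, ignoring case
    return item.lower() in sentence.lower()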
Example #5
def run_postmates_image_scraper(postmates_code, foodie_id):
    #Opening proper webpage
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--incognito")
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(options=chrome_options)

    wiki = "https://postmates.com/merchant/" + postmates_code
    driver.get(wiki)
    n = 0

    script_dir = os.path.abspath(os.path.join(__file__, "../.."))
    path = script_dir + "/csvfiles/images/" + foodie_id + "-images-postmates/"
    exceptions = [
        'a', 'an', 'of', 'the', 'is', 'with', 'or', 'and', 'to', 'from'
    ] + foodie_id.split('-')

    # menu_items = read_items(script_dir, foodie_id)
    json = extract_json(foodie_id)
    menu_items = save_locally(json, foodie_id)

    foodie_ids = []
    items = []
    filenames = []
    matches = []

    elements = driver.find_elements_by_xpath(
        "//div[@class='product-container css-2ko7m4 e1tw3vxs3']")
    for element in elements:
        item_name = element.find_element_by_xpath(
            ".//h3[@class='product-name css-1yjxguc e1tw3vxs4']"
        ).get_attribute("innerText")

        matched_items = items_in_sentence(item_name, menu_items, 2, foodie_id,
                                          exceptions)
        if (len(matched_items) == 0):
            continue

        imgs = element.find_elements_by_xpath(
            ".//img[@class='css-1hyfx7x e1qfcze94']")
        for img in imgs:
            img_src = img.get_attribute("src")
            print(img_src)

            optimized_items = optimize_list(matched_items, item_name.lower())
            print(optimized_items)
            for item in optimized_items:

                if n == 0:
                    # create the output directory on the first download
                    try:
                        os.makedirs(path)
                    except OSError:
                        pass

                filename = foodie_id + "-" + str(n) + ".jpg"

                # trim the 'format=webp' suffix so the saved file is a plain JPEG;
                # str.find returns -1 when absent, which would wrongly chop the URL
                webp_finder = img_src.find('format=webp')
                if webp_finder != -1:
                    img_src = img_src[:webp_finder]
                print(img_src)
                save_img_url(img_src, path + filename)

                foodie_ids.append(foodie_id)
                items.append(item)
                filenames.append(filename)
                matches.append(item_name)
                n += 1
                print(n)

    driver.close()

    if n > 0:
        d = {
            'FoodieID': foodie_ids,
            'Item': items,
            'Filename': filenames,
            'Matches': matches
        }
        df = pd.DataFrame(d)
        df.to_excel(path + foodie_id + ".xlsx",
                    sheet_name='Sheet1',
                    encoding="utf8",
                    index=False)
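`save_img_url` is referenced but not defined in this section. A plausible sketch using `requests` to stream the image to disk (hypothetical; the repo's helper may differ):

import requests

def save_img_url(img_src, filepath):
    # hypothetical stand-in: download the image and write it in chunks
    resp = requests.get(img_src, stream=True, timeout=30)
    resp.raise_for_status()
    with open(filepath, 'wb') as f:
        for chunk in resp.iter_content(chunk_size=8192):
            f.write(chunk)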
Example #6
def main(f_suffix):
    df = read_input(f_suffix)
    for i, row in df.iterrows():
        if (isNaN(row['FoodieID'])):
            row['FoodieID'] = create_foodie_id(row)
        try:
            initialize_sources()
            if (row['Status'] == "Imported"):
                continue
            if "Scrape Info" in row['Status']:
                scrape_info(row, f_suffix, row['Neighborhood'])
                row['Status'] = 'Completed Info Scrape'
            elif "Scrape Menu" in row['Status']:
                source, source_id = scrape_menu(row)  # note: source and source_id are not used below
                scrape(row, i, f_suffix)
            if "New Request" in row['Status']:
                row = scrape(row, i, f_suffix)
            if "Menu Available" in row['Status']:
                row = scrape(row, i, f_suffix)
                if ("Manually add YelpID" not in row['Status']):
                    row['Status'] = match_analyze_compile(row)
            if "More Reviews" in row['Status']:
                row = scrape(row, i, f_suffix)
                if ("Manually add YelpID" not in row['Status']):
                    row['Status'] = match_analyze_compile(row)
            if "Generate PDF" in row['Status']:
                run_generate_analytics_pdf(row, row['FoodieID'])
            if "Ready to Import" in row['Status']:
                try:
                    add_to_database(row['FoodieID'])
                    row['Status'] = 'Imported'
                except Exception:
                    row['Status'] = "Failed to Import"

            if "Caviar Images" in row['Status']:
                print("Running Caviar image scraper")
                if (isNaN(row['CaviarID'])):
                    caviar_id = gather_id('Caviar', sources['Caviar'], row)
                else:
                    caviar_id = row['CaviarID']

                if (caviar_id == ''):
                    row['Status'] = 'Could not find CaviarID'
                else:
                    row['Status'] = run_caviar_image_scraper(
                        caviar_id, row['FoodieID'])
                    df.loc[i, 'CaviarID'] = caviar_id
                    # row['Status'] = 'Added Caviar Imgs'

            if "Instagram Images" in row['Status']:
                print("Running Instagram image scraper")
                if (isNaN(row['InstagramID'])):
                    instagram_id = gather_id('Instagram', sources['Instagram'],
                                             row)
                else:
                    instagram_id = row['InstagramID']
                if (instagram_id == ''):
                    row['Status'] = 'Could not find InstagramID'
                else:
                    df.loc[i, 'InstagramID'], n = run_ig_img_scraper(
                        instagram_id, row['FoodieID'])
                    if (n <= 1):
                        row['Status'] = "Zero IG Imgs scraped"
                    else:
                        row['Status'] = 'Added IG Imgs'

            if "Doordash Images" in row['Status']:
                print("Running Doordash image scraper")
                if (isNaN(row['DoordashID'])):
                    doordash_id = gather_id('DoorDash', sources['DoorDash'],
                                            row)
                else:
                    doordash_id = row['DoordashID']
                if (doordash_id == ''):
                    row['Status'] = 'Could not find DoordashID'
                else:
                    df.loc[i, 'DoordashID'] = doordash_id
                    row['Status'] = scrape_doordash_images(
                        doordash_id, row['FoodieID'])

            if "Yelp Images" in row['Status']:
                print("Running Yelp image scraper")
                if (isNaN(row['YelpID'])):
                    yelp_id = gather_id('Yelp', sources['Yelp'], row)
                else:
                    yelp_id = row['YelpID']
                if (yelp_id == ''):
                    row['Status'] = 'Could not find YelpID'
                else:
                    df.loc[i, 'YelpID'] = yelp_id
                    output = pull_yelp_images(yelp_id, 100)
                    n = analyze_yelp_images(output, row['FoodieID'], yelp_id)
                    row['Status'] = "Scraped Yelp Images" + str(n)

                # print("Running Postmates image scraper")
                # if(isNaN(row['PostMatesID'])):
                # 	postmates_id = gather_id('PostMates', sources['PostMates'], row)
                # 	print("New PostMatesID: ", postmates_id)
                # else:
                # 	postmates_id = row['PostMatesID']
                # 	print("Existing PostMates ID: ", postmates_id)
                # run_postmates_image_scraper(postmates_id, row['FoodieID'])
                # df.loc[i, 'PostMatesID'] = postmates_id

                # print("Running Doordash image scraper")
                # if(isNaN(row['DoordashID'])):
                # 	doordash_id = gather_id('Doordash', sources['Doordash'], row)
                # else:
                # 	doordash_id = row['DoordashID']
                # scrape_doordash_images(doordash_id, row['FoodieID'])
                # df.loc[i, 'DoordashID'] = doordash_id

            if "Instagram Info" in row['Status']:
                run_ig_info_scraper(row['Hashtag'], 100)
                row['Status'] = row['Status'].replace('Instagram Info',
                                                      '+IG Info')
            if "Pull Menu" in row['Status']:
                json = extract_json(row['FoodieID'])
                save_locally(json, row['FoodieID'])
                run_save_items_csv(row['FoodieID'])
                row['Status'] = "Pulled Menu from Database"
            if "Enter Info" in row['Status']:
                row = fill_in_data(row)
                df.loc[i] = row
                row['Status'] = "Entered Info from Database"
            if "User Comments General" in row['Status']:
                json = extract_json(row['FoodieID'])
                save_locally(json, row['FoodieID'])
                scrape_reviews(row, 150)
                run_match_sentences(row['FoodieID'], 4, 1)
                run_sentiment_analyzer(0, row['FoodieID'], '')
                row['Status'] = create_comments(row, 'General')
            if "User Comments DC" in row['Status']:
                json = extract_json(row['FoodieID'])
                save_locally(json, row['FoodieID'])
                scrape_reviews(row, 150)
                run_match_sentences(row['FoodieID'], 4, 1)
                run_sentiment_analyzer(0, row['FoodieID'], '')
                row['Status'] = create_comments(row, 'DC')
            if "FoursquareID" in row['Status']:
                row['FoursquareID'] = get_id(row['Name'], row['Address'],
                                             row['City'], row['State'],
                                             row['Zipcode'])
                print(row['FoursquareID'])
                df.loc[i, 'FoursquareID'] = row['FoursquareID']
            if "GooglePlaceID" in row['Status']:
                row['GooglePlaceID'], row['Status'] = generate_google_api_key(
                    row['Name'], row['Address'], row['City'], row['State'],
                    row['Zipcode'])
                df.loc[i, 'GooglePlaceID'] = row['GooglePlaceID']
            if "Delivery Link" in row['Status']:
                get_delivery_links(row)
                import_delivery_links(f_suffix)
                row['Status'] = 'Added Del. Link'
            if "Automated DMs" in row['Status']:
                if (isNaN(row['InstagramID'])):
                    instagram_id = gather_id('Instagram', sources['Instagram'],
                                             row)
                else:
                    instagram_id = row['InstagramID']
                usernames_for_automated_dms(instagram_id, 300, row['Name'],
                                            row['City'])
                row['Status'] = 'Gathered Usernames'

            if "Doordash Menu" in row['Status']:
                if (isNaN(row['DoordashID'])):
                    doordash_id = gather_id('DoorDash', sources['DoorDash'],
                                            row)
                    row['DoordashID'] = doordash_id
                    df.loc[i, 'DoordashID'] = doordash_id
                else:
                    doordash_id = row['DoordashID']
                if doordash_id == "":
                    row['Status'] = "Could not find DoordashID"
                else:
                    scrape_doordash_menu(doordash_id, row['FoodieID'])
                    run_save_items_csv(row['FoodieID'])
                    row['Status'] = 'Menu Scraped from Doordash'

            if "Grubhub Menu" in row['Status']:
                if (isNaN(row['GrubhubID'])):
                    grubhub_id = gather_id('GrubHub', sources['GrubHub'], row)
                    df.loc[i, 'GrubhubID'] = grubhub_id
                else:
                    grubhub_id = row['GrubhubID']
                if grubhub_id == "":
                    row['Status'] = "Could not find GrubhubID"
                else:
                    scrape_grubhub_menu(
                        'https://api-gtm.grubhub.com/restaurants/' +
                        grubhub_id,
                        'Bearer d738ccb5-56fa-4618-849a-0862aac27a60',
                        row['FoodieID'])
                    run_save_items_csv(row['FoodieID'])
                    row['Status'] = 'Menu Scraped from GrubHub'

            if "Caviar Menu" in row['Status']:
                if (isNaN(row['CaviarID'])):
                    caviar_id = gather_id('Caviar', sources['Caviar'], row)
                    df.loc[i, 'CaviarID'] = caviar_id
                else:
                    caviar_id = row['CaviarID']
                if caviar_id == "":
                    row['Status'] = "Could not find CaviarID"
                else:
                    scrape_caviar_menu(caviar_id, row['FoodieID'])
                    run_save_items_csv(row['FoodieID'])
                    row['Status'] = 'Menu Scraped from Caviar'

            if "Google Description" in row['Status']:
                query = row['Name'] + row['Address'] + row['City']
                google_description = scrape_google_description(query)
                print(google_description)
                df.loc[i, 'Google Description'] = google_description
                row['Status'] = 'Scraped Description'
            if "Google Hours Spent" in row['Status']:
                query = row['Name'] + row['Address'] + row['City']
                hours_spent = scrape_google_hours_spent(query)
                df.loc[i, 'Google Hours Spent'] = hours_spent
                row['Status'] = 'Scraped Hours Spent'
            if "Categories" in row['Status']:
                name, address, city, state, zipcode, yelpId = read_row(row)
                if (isNaN(row['YelpID'])):
                    row['YelpID'] = gather_id('Yelp', sources['Yelp'], row)
                categories = gather_categories(name, address, city, state,
                                               zipcode, row['YelpID'])
                df.loc[i, "Categories"] = categories
                row['Status'] = 'Scraped Category'
            if "Yelp Neighborhood" in row['Status']:
                if (isNaN(row['YelpID'])):
                    print("Gathering YelpID")
                    row['YelpID'] = gather_id('Yelp', sources['Yelp'], row)
                print(row['YelpID'])
                info = scrap_data(row['YelpID'], "")
                df.loc[i, 'Yelp Neighborhood'] = info['Neighborhood']
                row['Status'] = 'Scraped Neighborhood'
            if "Woflow Menu" in row['Status']:
                run_woflow_conversion(row['FoodieID'], row['GooglePlaceID'],
                                      row['Name'])
                run_save_items_csv(row['FoodieID'])
                run_save_items_pickle(row['FoodieID'], row['FoodieID'])
                run_save_items_csv(row['FoodieID'])
                row['Status'] = 'Converted Woflow'
            if "Infatuation Reviews" in row['Status']:
                review, date = scrape_infatuation_reviews(row['InfatuationID'])
                n = create_output(review, date, row['FoodieID'])
                if (n == 0):
                    row['Status'] = "Zero Infatuation Rvs Scraped"
                else:
                    row['Status'] = "Added Infatuation Rvs" + str(n)

        except Exception:
            traceback.print_exc()
            row['Status'] = "Script Error"
        df.loc[i, 'Status'] = row['Status']
        df.loc[i, 'FoodieID'] = row['FoodieID']
        df.loc[i, 'YelpID'] = row['YelpID']
        save_output(df, f_suffix)
    try:
        zip_upwork_verify_menus(f_suffix)
    except Exception:
        pass
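The dispatcher gates most branches on `isNaN(...)` over spreadsheet cells. A plausible one-liner for that helper, assuming empty cells arrive as NaN from pandas (hypothetical; the repo's version may differ):

import pandas as pd

def isNaN(value):
    # pd.isna catches float('nan') from empty cells as well as None
    return pd.isna(value)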
Example #7
def analyze_yelp_images(images_data, foodie_id, yelp_id):
    exceptions = [
        'a', 'an', 'of', 'the', 'is', 'with', 'or', 'and', 'to', 'from'
    ] + foodie_id.split('-')

    script_dir = os.path.abspath(os.path.join(__file__, "../.."))

    # menu_items = read_items(script_dir, foodie_id)
    json = extract_json(foodie_id)
    menu_items = save_locally(json, foodie_id)

    path = script_dir + "/csvfiles/images/" + foodie_id[:100] + yelp_id.replace(
        '/', '-') + "-images-yelp/"
    try:
        os.makedirs(path)
    except OSError:
        pass

    # initializing
    foodie_ids = []
    items = []
    filenames = []
    captions = []
    source_ids = []

    n = 100  # start numbering at 100, likely so Yelp filenames don't collide with other scrapers; the return below subtracts the offset

    for image_data in images_data:
        caption = image_data[0]
        link = image_data[1]

        # remove sentence if it does not contain a menu item
        items_in_given_sentence = items_in_sentence(caption, menu_items, 2,
                                                    foodie_id, exceptions)
        print(items_in_given_sentence)
        if (len(items_in_given_sentence) == 0):
            continue

        # choose best item out of all matched items
        optimized_items = optimize_list(items_in_given_sentence,
                                        caption.lower())
        for item in optimized_items:

            filename = foodie_id + "-" + str(n) + ".jpg"
            urlretrieve(link, path + filename)

            foodie_ids.append(foodie_id)
            items.append(item)
            filenames.append(filename)
            captions.append(caption)
            source_ids.append(yelp_id)
            n += 1
            print(n)

    d = {
        'FoodieID': foodie_ids,
        'Item': items,
        'Filename': filenames,
        'Captions': captions,
        'YelpID': source_ids
    }
    df = pd.DataFrame(d)
    df.to_excel(path + foodie_id[:100] + yelp_id.replace('/', '-') + ".xlsx",
                sheet_name='Sheet1',
                encoding="utf8",
                index=False)

    return n - 100
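In the dispatcher (Example #6) this function consumes the output of `pull_yelp_images`; a sketch of that call sequence, assuming `yelp_id` and `foodie_id` are already in hand:

output = pull_yelp_images(yelp_id, 100)  # up to 100 (caption, link) pairs
n = analyze_yelp_images(output, foodie_id, yelp_id)
print("Scraped Yelp Images", n)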
Example #8
def run_chowbusimagescraper(chowbus_id, foodie_id):
	chrome_options = webdriver.ChromeOptions()
	chrome_options.add_argument("--incognito")
	chrome_options.add_argument('--headless')
	chrome_options.add_argument('--no-sandbox')
	chrome_options.add_argument('--disable-gpu')
	driver = webdriver.Chrome(options=chrome_options)

	wiki = "https://www.chowbus.com" + chowbus_id
	driver.get(wiki)
 
	#Logistics
	script_dir = os.path.abspath(os.path.join(__file__, "../.."))
	path = script_dir + "/csvfiles/images/" + foodie_id + "-images-chowbus/"
	exceptions = ['a', 'an', 'of', 'the', 'is', 'with', 'or', 'and', 'to', 'from'] + foodie_id.split('-')
	print(path)


	# menu_items = read_items(script_dir, foodie_id)
	json = extract_json(foodie_id)
	if(json['Items'] == []):
		return 'Could not pull images from database. Potential FoodieID mismatch.'
	menu_items = save_locally(json, foodie_id)

	foodie_ids = []
	source_ids = []
	items = []
	filenames = []
	matches = []
	n = 0


	elements = driver.find_elements_by_class_name('jss290')
	print("Elements length", len(elements))
	for element in elements:
		# take the tile's visible text as the item name (mirrors the Postmates scraper)
		item_name = element.get_attribute("innerText")

		matched_items = items_in_sentence(item_name, menu_items, 2, foodie_id, exceptions)
		if(len(matched_items) == 0):
			continue

		imgs = element.find_elements_by_class_name("jss326")
		for img in imgs:
			img_src = img.get_attribute("src")
			print(img_src)

			optimized_items = optimize_list(matched_items, item_name.lower())
			print("the length of the list is:", len(optimized_items))
			for item in optimized_items:
				if n == 0:
					# create the output directory on the first download
					try:
						os.makedirs(path)
					except OSError:
						pass

				filename = foodie_id + "-" + str(n) + ".jpg"
				urlretrieve(img_src, path + filename)
				print(filename)

				foodie_ids.append(foodie_id)
				items.append(item)
				filenames.append(filename)
				matches.append(item_name)
				n += 1
				print(n)

	driver.close()
	if n > 0:
		d = {'FoodieID': foodie_ids, 'Item': items, 'Filename': filenames, 'Matches': matches}
		df = pd.DataFrame(d)
		df.to_excel(path + foodie_id + ".xlsx", sheet_name='Sheet1', encoding="utf8", index=False)
		return 'Added Chowbus Imgs'
	return 'No Chowbus Imgs Scraped'
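The `jss290`/`jss326` class names are generated by JSS and rotate between deploys, so this scraper is unusually brittle. A sketch of an explicit wait that fails loudly instead of silently scraping zero tiles (standard Selenium API; the class name itself is still an assumption):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for_tiles(driver, class_name='jss290', timeout=10):
    # raises TimeoutException when the class has rotated,
    # rather than returning an empty element list
    return WebDriverWait(driver, timeout).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, class_name)))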
Example #9
def create_comments(row, user_type):
	foodie_id = row['FoodieID']
	yelp_id = row['YelpID']
	# set file paths to data inputs
	script_dir = os.path.abspath(os.path.join(__file__ ,"../.."))
	file_path = script_dir + "/csvfiles/sentences/labeled-rated/" + foodie_id + "-labeled-rated.xlsx"
	output_path = script_dir + "/csvfiles/sentences/comments/" + foodie_id + "-comments.xlsx"

	json = extract_json(foodie_id)

	df = pd.read_excel(file_path)

	info = json['Info']

	df = df.drop(columns=['Keywords'])
	df = max_3(df)

	# df = df.drop_duplicates(subset=['Item'], keep="first")

	df['Source'] = 'Yelp'
	# if yelp_id == '':
	# 	df['SourceRestaurantCode'] = get_yelp_id(json)
	# else:
	# 	df['SourceRestaurantCode'] = yelp_id
	df['FoodieID'] = foodie_id
	df['Month'] = ""
	df['Day'] = ""
	df['Year'] = ""
	start = user_ranges[user_type]['start']
	end = user_ranges[user_type]['end']

	# df.loc[i, 'SourceUserCode'] = "user-" + user_code
	user_code = randint(start, end) + 1000		
	random_day = random_date()
	# df.loc[i, 'Month'] = random_day.month
	# df.loc[i, 'Day'] = random_day.day
	# df.loc[i, 'Year'] = random_day.year

	for i, df_row in df.iterrows():
		print(df_row)
		if i % 5 == 0:
			user_code = randint(start, end) + 1000
			if(user_code == 1104):
				user_code = 1105
			random_day = random_date()
		if(df_row['Item'] in dish_user_match):
			while(user_code in dish_user_match[df_row['Item']]):
				user_code = randint(start, end) + 1000
			dish_user_match[df_row['Item']].append(user_code)
		else:
			dish_user_match[df_row['Item']] = [user_code]
		df.loc[i, 'SourceUserCode'] = "user-" + str(user_code)
		df.loc[i, 'Month'] = random_day.month
		df.loc[i, 'Day'] = random_day.day
		df.loc[i, 'Year'] = random_day.year
		df.loc[i, 'Categories'], df.loc[i, 'Menu'], df.loc[i, 'Description'] = match_item(df_row['Item'], json)

	df = df[['Source', 'SourceUserCode', 'FoodieID', 'Month', 'Day', 'Year', 'Item', 'Sentence', 'Rating', 'Categories', 'Menu', 'Description']]

	df.to_excel(output_path, index=False)
	return "Comments Extracted"
Example #10
def run_caviar_image_scraper(caviar_id, foodie_id):
    #Open browser in incognito
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--incognito")
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(options=chrome_options)

    wiki = "https://www.trycaviar.com/" + caviar_id
    driver.get(wiki)

    #Logistics
    script_dir = os.path.abspath(os.path.join(__file__, "../.."))
    path = script_dir + "/csvfiles/images/" + foodie_id + "-images-caviar/"
    exceptions = [
        'a', 'an', 'of', 'the', 'is', 'with', 'or', 'and', 'to', 'from'
    ] + foodie_id.split('-')

    # menu_items = read_items(script_dir, foodie_id)
    json = extract_json(foodie_id)
    if (json['Items'] == []):
        return 'Could not pull images from database. Potential FoodieID mismatch.'
    menu_items = save_locally(json, foodie_id)

    foodie_ids = []
    source_ids = []
    items = []
    filenames = []
    matches = []
    n = 0

    #Click on reviews
    dishes = driver.find_elements_by_xpath(
        "//a[@class='js-offer-link offer-tile_link']")
    dishes = dishes + driver.find_elements_by_xpath(
        "//a[@class='js-offer-link offer-tile_link offer-tile_link--unavailable']"
    )
    dish_links = []
    for dish in dishes:
        dish_link = dish.get_attribute("href")
        dish_links.append(dish_link)

    for dish_link in dish_links:
        driver.get(dish_link)
        item_name = driver.find_element_by_xpath(
            "//h1[@class='item_name']").text
        item_img_srcset = driver.find_elements_by_xpath(
            "//img[@class='item_image']")
        if not item_img_srcset:
            continue

        print(item_name)
        print(item_img_srcset)

        # srcset holds "url descriptor" pairs; after splitting on whitespace,
        # the second-to-last token is the URL of the largest candidate
        item_img_srcset = item_img_srcset[0].get_attribute("srcset").split()
        img_src = item_img_srcset[-2]

        matched_items = items_in_sentence(item_name, menu_items, 2, foodie_id,
                                          exceptions)
        if (len(matched_items) == 0):
            continue

        optimized_items = optimize_list(matched_items, item_name.lower())
        for item in optimized_items:
            if n == 0:
                try:
                    os.makedirs(path)
                except OSError:
                    pass

            filename = foodie_id + "-" + str(n) + ".jpg"
            urlretrieve(img_src, path + filename)

            foodie_ids.append(foodie_id)
            items.append(item)
            filenames.append(filename)

            matches.append(item_name)
            n += 1
            print(n)

    driver.close()
    if n > 0:
        d = {
            'FoodieID': foodie_ids,
            'Item': items,
            'Filename': filenames,
            'Matches': matches
        }
        df = pd.DataFrame(d)
        df.to_excel(path + foodie_id + ".xlsx",
                    sheet_name='Sheet1',
                    encoding="utf8",
                    index=False)
        return 'Added Caviar Imgs'
    else:
        return 'No Caviar Imgs Scraped'
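Taking `split()[-2]` matches the srcset layout Caviar served at the time, but a sturdier parse follows the "url descriptor, url descriptor" grammar directly; a sketch:

def largest_srcset_url(srcset):
    # each comma-separated candidate is "url [descriptor]";
    # return the URL of the last (typically largest) candidate
    candidates = [c.strip().split()[0] for c in srcset.split(',') if c.strip()]
    return candidates[-1] if candidates else None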
Example #11
def scrape_images(storeId, foodie_id):
	name = "doordash_menu"
	url = "https://api-consumer-client.doordash.com/graphql"

	headers = {
	     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
	     'Content-Type': 'application/json',
	     'Credentials': 'include'
	}
	body = {
	    "operationName": "menu",
	    "variables": {
	        # "storeId": "360",
	        # "storeId": "2693"
	        "storeId": storeId
	        # "menuId": "223199"
	    },
	    "query":'''
	          query menu($storeId: ID!, $menuId: ID) {
	            storeInformation(storeId: $storeId) {
	              id
	              name
	              description
	              isGoodForGroupOrders
	              offersPickup
	              offersDelivery
	              deliveryFee
	              sosDeliveryFee
	              numRatings
	              averageRating
	              shouldShowStoreLogo
	              isConsumerSubscriptionEligible
	              headerImgUrl
	              coverImgUrl
	              distanceFromConsumer
	              providesExternalCourierTracking
	              fulfillsOwnDeliveries
	              isDeliverableToConsumerAddress
	              priceRange
	              business {
	                id
	                name
	                __typename
	              }
	              address {
	                street
	                printableAddress
	                lat
	                lng
	                city
	                state
	                __typename
	              }
	              status {
	                asapAvailable
	                scheduledAvailable
	                asapMinutesRange
	                asapPickupMinutesRange
	                __typename
	              }
	              merchantPromotions {
	                id
	                minimumOrderCartSubtotal
	                newStoreCustomersOnly
	                deliveryFee
	                __typename
	              }
	              storeDisclaimers {
	                id
	                disclaimerDetailsLink
	                disclaimerLinkSubstring
	                disclaimerText
	                displayTreatment
	                __typename
	              }
	              __typename
	            }
	            storeMenus(storeId: $storeId, menuId: $menuId) {
	              allMenus {
	                id
	                name
	                subtitle
	                isBusinessEnabled
	                timesOpen
	                __typename
	              }
	              currentMenu {
	                id
	                timesOpen
	                hoursToOrderInAdvance
	                isCatering
	                minOrderSize
	                menuCategories {
	                  ...StoreMenuCategoryFragment
	                  items {
	                    ...StoreMenuListItemFragment
	                    __typename
	                  }
	                  __typename
	                }
	                __typename
	              }
	              __typename
	            }
	            storeCrossLinks(storeId: $storeId) {
	              trendingStores {
	                ...StoreCrossLinkItemFragment
	                __typename
	              }
	              trendingCategories {
	                ...StoreCrossLinkItemFragment
	                __typename
	              }
	              topCuisinesNearMe {
	                ...StoreCrossLinkItemFragment
	                __typename
	              }
	              nearbyCities {
	                ...StoreCrossLinkItemFragment
	                __typename
	              }
	              __typename
	            }
	          }

	        fragment StoreMenuCategoryFragment on StoreMenuCategory {
	          id
	          subtitle
	          title
	          __typename
	        }

	        fragment StoreMenuListItemFragment on StoreMenuListItem {
	          id
	          description
	          isTempDeactivated
	          price
	          imageUrl
	          name
	          __typename
	        }

	        fragment StoreCrossLinkItemFragment on StoreCrossLinkItem {
	          name
	          url
	          __typename
	        }

	    '''
	}
	response = requests.post(url, cookies={'X-CSRFToken': 'MKp9Os0ao3HiPO9ybnSFdDy7HrrodcxiFOWVhuhjaHEybo28kCAfBwMOWp6b78BU'}, data = json.dumps(body), headers = headers)
	print(response)
	allMenus = response.json()['data']['storeMenus']['allMenus']

	#Logistics
	script_dir = os.path.abspath(os.path.join(__file__ ,"../.."))
	path = script_dir + "/csvfiles/images/" + foodie_id + "-images-doordash/"
	exceptions = ['a', 'an', 'of', 'the', 'is', 'with', 'or', 'and', 'to', 'from'] + foodie_id.split('-')

	# menu_items = read_items(script_dir, foodie_id)
	json_t = extract_json(foodie_id)
	if(json_t['Items'] == []):
		return 'Could not pull images from database. Potential FoodieID mismatch.'
	menu_items = save_locally(json_t, foodie_id)
	
	foodie_ids = []
	source_ids = []
	items = []
	filenames = []
	matches = []
	n = 0

	for menu in allMenus:
		menu_name = menu['subtitle']
		
		# self.body['variables']['menuId'] = menu['id']
		# print(menu['id'])
		body['variables']['menuId'] = menu['id']
		response = requests.post(url, cookies={'X-CSRFToken': 'MKp9Os0ao3HiPO9ybnSFdDy7HrrodcxiFOWVhuhjaHEybo28kCAfBwMOWp6b78BU'}, data = json.dumps(body), headers = headers)
		for category in response.json()['data']['storeMenus']['currentMenu']['menuCategories']:
			for item in category['items']:
				if(item['imageUrl']):
					item_name = item['name']
					img_url = item['imageUrl']

					matched_items = items_in_sentence(item_name, menu_items, 2, foodie_id, exceptions)
					if(len(matched_items) == 0):
						continue     

					optimized_items = optimize_list(matched_items, item_name.lower())
					for item in optimized_items:
						if n == 0:
							try: os.makedirs(path)
							except OSError: pass

						filename = foodie_id + "-" + str(n) + ".jpg"
						urlretrieve(img_url, path + filename)

						# imread returns None when the download is not a decodable image
						img = cv2.imread(path + filename, cv2.IMREAD_UNCHANGED)
						if img is None:
							continue

						# keep only images large enough to be usable
						height = img.shape[0]
						width = img.shape[1]
						if(height > 300 and width > 450):
							print(height, width)
							foodie_ids.append(foodie_id)
							items.append(item)
							filenames.append(filename)

							matches.append(item_name)
							n += 1
							print(n)

	if n > 0:
		d = {'FoodieID' : foodie_ids, 'Item' : items, 'Filename' : filenames, 'Matches' : matches}
		df = pd.DataFrame(d)
		df.to_excel(path + foodie_id + ".xlsx", sheet_name='Sheet1', encoding="utf8", index=False)
		return 'Added Doordash Imgs'
	else:
		return 'Zero Doordash Imgs Scraped'
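The same authenticated POST is issued once to list `allMenus` and again per `menuId`. A small sketch factoring that call out, reusing the endpoint, headers, and CSRF token hard-coded above:

import json
import requests

def post_menu_query(url, body, headers, csrf_token):
    # identical to the inline requests.post calls in scrape_images
    resp = requests.post(url, cookies={'X-CSRFToken': csrf_token},
                         data=json.dumps(body), headers=headers)
    resp.raise_for_status()
    return resp.json()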
Example #12
def pull_images(response_json, foodie_id):

    #Logistics
    script_dir = os.path.abspath(os.path.join(__file__, "../.."))
    path = script_dir + "/csvfiles/images/" + foodie_id + "-images-grubhub/"
    exceptions = [
        'a', 'an', 'of', 'the', 'is', 'with', 'or', 'and', 'to', 'from'
    ] + foodie_id.split('-')

    # menu_items = read_items(script_dir, foodie_id)
    json_t = extract_json(foodie_id)
    if (json_t['Items'] == []):
        return 'Could not pull images from database. Potential FoodieID mismatch.'
    menu_items = save_locally(json_t, foodie_id)

    foodie_ids = []
    source_ids = []
    items = []
    filenames = []
    matches = []
    n = 0

    print(response_json)
    categories = response_json['restaurant']['menu_category_list']
    for category in categories:
        dishes = category['menu_item_list']
        for dish in dishes:
            item_name = dish['name']
            if 'media_image' in dish:
                img_url = dish['media_image']['base_url'] + dish['media_image'][
                    'public_id'] + '.' + dish['media_image']['format']
                print(img_url)
                matched_items = items_in_sentence(item_name, menu_items, 2,
                                                  foodie_id, exceptions)
                if (len(matched_items) == 0):
                    continue

                optimized_items = optimize_list(matched_items,
                                                item_name.lower())
                for item in optimized_items:
                    if n == 0:
                        try:
                            os.makedirs(path)
                        except OSError:
                            pass

                    filename = foodie_id + "-" + str(n) + ".jpg"
                    urlretrieve(img_url, path + filename)

                    foodie_ids.append(foodie_id)
                    items.append(item)
                    filenames.append(filename)

                    matches.append(item_name)
                    n += 1
                    print(n)

    if n > 0:
        d = {
            'FoodieID': foodie_ids,
            'Item': items,
            'Filename': filenames,
            'Matches': matches
        }
        df = pd.DataFrame(d)
        df.to_excel(path + foodie_id + ".xlsx",
                    sheet_name='Sheet1',
                    encoding="utf8",
                    index=False)
        return 'Added GrubHub Imgs'
    return 'Zero GrubHub Imgs Scraped'
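In the dispatcher (Example #6), the GrubHub payload comes from `scrape_grubhub_menu` against https://api-gtm.grubhub.com/restaurants/<id>; feeding a parsed response into this function is then a one-liner (sketch; `response_json` and `foodie_id` assumed already in hand):

status = pull_images(response_json, foodie_id)
print(status)  # 'Added GrubHub Imgs' or 'Zero GrubHub Imgs Scraped'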