import os

from google_images_download import google_images_download


def download_matting_dataset(output_dir):
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    # Download transparent-background portraits (foreground images).
    response = google_images_download.googleimagesdownload()
    response.download({
        "keywords": "portrait transparent background",
        "color_type": "transparent",
        "size": "medium",
        "limit": 500,
        "output_directory": output_dir,
        "chromedriver": "/usr/local/bin/chromedriver"})

    # Download full-colour textures to use as backgrounds.
    response = google_images_download.googleimagesdownload()
    response.download({
        "keywords": "texture background",
        "color_type": "full-color",
        "size": "medium",
        "limit": 500,
        "output_directory": output_dir,
        "chromedriver": "/usr/local/bin/chromedriver"})
# -*- coding: utf-8 -*-
"""
dl.py

A module to search Google and download images corresponding to search terms.
From: https://github.com/hardikvasa/google-images-download
"""
from google_images_download import google_images_download

response = google_images_download.googleimagesdownload()

args = ['french identity card']


def run():
    for arg in args:
        absolute_image_paths = response.download({
            'keywords': arg,
            'limit': 200,
            # Requires `chromedriver` for more than 100 image scrapes.
            # To download: https://sites.google.com/a/chromium.org/chromedriver/downloads (link live 8/30/18)
            'chromedriver': 'C:/apps/chromedriver/chromedriver.exe',
            'proxy': 'fr-proxy.groupinfra.com:3128'
        })
def downloadFiles(name):
    response = google_images_download.googleimagesdownload()  # class instantiation
    arguments = {"keywords": name,
                 "limit": 50,
                 "print_urls": True,
                 "usage_rights": "labeled-for-reuse-with-modifications",
                 "format": "jpg"}  # creating the dictionary of arguments
    paths = response.download(arguments)  # passing the arguments to the function
from google_images_download import google_images_download  # * libraries
import sys

# * required inputs
busca = input("Search term: ")
limit = input("How many images to download: ")
formato = input("Desired format (jpg, gif, png, bmp, svg, webp, ico, raw): ")

response = google_images_download.googleimagesdownload()  # * class instantiation

# * arguments
arguments = {
    "keywords": busca,
    "limit": int(limit),  # cast the user input (a string) to an integer
    "print_urls": True,
    "delay": 1,
    "output_directory": "imagens",
    "prefix": busca,
    "format": formato
}

paths = response.download(arguments)  # * passing the arguments to the function
print(paths)
from google.cloud import storage, vision
from google_images_download import google_images_download

# `Color`, `__tooBig` and `__average` are assumed to be defined elsewhere in this module.


def stuff(keyword):
    ColorList = []
    file = "cache/" + keyword.lower()

    # Return the cached palette from Cloud Storage if it already exists.
    storageClient = storage.Client()
    bucket = storageClient.get_bucket('askpalette.appspot.com')
    blob = bucket.blob(file)
    exists = storage.Blob(bucket=bucket, name=file).exists(storageClient)
    if exists:
        data = blob.download_as_string()
        if data:
            lines = data.decode().splitlines()
            for line in lines:
                parts = line.split(",")
                ColorList.append(Color(parts[0], parts[1], parts[2], parts[3]))
            return ColorList

    try:
        response = google_images_download.googleimagesdownload()  # class instantiation
        arguments = {
            "keywords": keyword,
            "limit": 3,
            "silent_mode": True,
            "no_numbering": True,
            "no_download": True
        }
        # arguments = {"keywords": keyword, "limit": 5, "no_numbering": True}
        paths = response.download(arguments)  # passing the arguments to the function
        while len(paths[0][keyword]) < 1:
            paths = response.download(arguments)

        for uri in paths[0][keyword]:
            # Skip oversized images, then load the rest into the Vision API.
            if __tooBig(uri):
                continue
            print(uri)
            client = vision.ImageAnnotatorClient()
            image = vision.types.Image()
            image.source.image_uri = uri
            response = client.image_properties(image=image)
            props = response.image_properties_annotation
            # print('Properties:')
            # print(response)
            for colorData in props.dominant_colors.colors:
                color = Color(colorData.color.red, colorData.color.green,
                              colorData.color.blue, colorData.score)
                merged = False
                for existing in ColorList:
                    if existing.diff(color) < 10:
                        existing = __average(existing, color)
                        merged = True
                        break
                if not merged:
                    ColorList.append(color)

        # Merge colours that are close to each other.
        ColorList2 = ColorList.copy()
        for existing in ColorList:
            for existing2 in ColorList2:
                threshold = 18
                if len(ColorList) < 7:
                    threshold = 5
                if existing != existing2 and existing.diff(existing2) < threshold:
                    existing = __average(existing, existing2)
                    ColorList.remove(existing2)
                    ColorList2.remove(existing2)
        ColorList.sort(key=lambda color: color.score, reverse=True)

        # Cache the palette back to Cloud Storage.
        uploadStr = ""
        for color in ColorList:
            uploadStr += (color.__str__() + '\n')
        storageClient = storage.Client()
        bucket = storageClient.get_bucket('askpalette.appspot.com')
        blob = bucket.blob(file)
        blob.upload_from_string(uploadStr)
    except:
        return None
    return ColorList


# stuff("music")
# print(Color(6, 17, 71, 0).diff(Color(6, 9, 36, 1)))
def googleimage_download(dictionary):
    # Note: the `dictionary` parameter is unused; the module-level `Dict` is passed instead.
    global Dict
    response = google_images_download.googleimagesdownload()
    arguments = Dict
    paths = response.download(arguments)
from google_images_download import google_images_download

downloader = google_images_download.googleimagesdownload()

# Download images. Unsplash is an image website; it serves as a random-image keyword.
downloader.download(
    {
        "keywords": "croissant,pain au chocolat,unsplash",
        "output_directory": "dataset",
        "limit": 500,
        "chromedriver": "/home/pierre/Development/Devoxx/python/venv/lib/python3.6/site-packages/chromedriver_binary/chromedriver"
    }
)
def the_function(the_name): the_anime=the_name #main window window = Tk() window.title("Anime Synopsis \n") window.configure(background="black") window.geometry("1280x720") #scraping function def scrapeit(name_anime): anime_name=name_anime name_update=name_anime.replace (" ","%20") search_string_1="https://myanimelist.net/anime.php?q=" final_search=search_string_1+name_update #print(final_search) search_init=requests.get(final_search) search_soup=BeautifulSoup(search_init.text,'lxml') the_final_link="" for link in search_soup.find_all('a', attrs={'class' : 'hoverinfo_trigger fw-b fl-l'},limit=1): the_final_link=link['href'] return the_final_link def defineit(the_final_link): res=requests.get(the_final_link) #res_content=res.content soup=BeautifulSoup(res.text,"lxml") soup_re=soup.find_all("span", itemprop="description") #print(soup.title.string) for i in soup_re: return(i.text) def stateit(the_anime_link): res=requests.get(the_anime_link) soup=BeautifulSoup(res.text,"lxml") soup_re=soup.findAll("div",{ "id" : "content" }) for i in soup_re: trash=i.text def Convert(string): li=list(string.split("\n")) return li list_re=Convert(trash) indexx=list_re.index("Status:",30) status_1=str(list_re[indexx+1]) return status_1 #background response = google_images_download.googleimagesdownload() #class instantiation arguments = {"keywords":str(the_name),"limit":1,"format":"jpg","size":"icon","silent_mode":1} #creating list of arguments paths = response.download(arguments) path_1=str(paths[0].get(str(the_name))) path_2=path_1.replace('[','') path_3=path_2.replace(']','') path_4=path_3.replace("'",'') #picture_1=PhotoImage(file=path_4) img = ImageTk.PhotoImage(Image.open(path_4)) Label(window, image=img, bg="black").grid(row=2,column=0,sticky=N+S+E+W) #label2 Label(window,text="Synopsis:\n", bg="black", fg="white", font="none 12 bold").grid(row=0, column=0,sticky=N+S+E+W) #output text output=tkscrolled.ScrolledText(window,width=100, height=6, wrap=WORD, background="white") output.grid(row=1, column=0, columnspan=3,sticky=N+S+E+W) output.delete(0.0, END) Label(window,text="\nStatus:",bg="black",fg="white",font="none 12 bold").grid(row=4,column=0,sticky=W) output_status=Text(window,width=50, height=2, wrap=WORD, background="white") output_status.grid(row=4, column=1, columnspan=1, sticky=W) output_status.delete(0.0,END) try: the_anime_link=scrapeit(the_name) defination=defineit(the_anime_link) status=stateit(the_anime_link) except: defination="Do you even watch Anime?" output.insert(END,defination) output_status.insert(END,status) #exit function def close_window(): window.destroy() exit() #exit label Label(window,text="Click here to exit\n", bg="black", fg="white", font="none 12 bold").grid(row=6, column=0,sticky=N+S+E+W) #exit button Button(window,text="Exit", width=14, command=close_window).grid(row=7, column=0,sticky=N+S+E+W) #mainloop window.mainloop()
from google_images_download import google_images_download

options = {"keywords": None, "limit": None, "output_directory": None}
google_obj = google_images_download.googleimagesdownload()
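# The snippet above only declares the option placeholders and the client.
# A minimal, assumed sketch of how they might be wired together; the keyword,
# limit, and directory values are illustrative and not from the original.
options["keywords"] = "sunset over mountains"
options["limit"] = 10
options["output_directory"] = "downloads"

paths = google_obj.download(options)  # downloaded file paths, keyed by keyword
print(paths)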
def search(query, limit):
    response = google_images_download.googleimagesdownload()
    arguments = {"keywords": query, "limit": limit, "print_urls": True}
    paths = response.download(arguments)
    print(paths)
def __init__(self):
    # instantiate the google_images_download client object
    self.gic = google_images_download.googleimagesdownload()
    log.basicConfig(filename="tophitslogs.log", level=log.DEBUG)
def download_images(keys, no_of_url_required):
    response = google_images_download.googleimagesdownload()  # class instantiation
    arguments = {"keywords": keys, "limit": no_of_url_required, "print_urls": True}
    paths = response.download(arguments)
import requests
from bs4 import BeautifulSoup as BS

# `img_url` (the Google Images search URL) and `keyword` are assumed to be
# defined earlier in the original script.

headers = {
    'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:66.0) Gecko/20100101 Firefox/66.0"
}  # User-Agent header

site_html = requests.get(img_url, headers=headers).text  # raw HTML of the search results page
soup = BS(site_html, "html.parser")  # BeautifulSoup object built from the raw HTML and an HTML parser

# This code downloads the first thumbnail-sized photo that it encounters.
for element in soup.findAll('img'):
    url = element.get("src")  # gets the src link of the image
    r = requests.get(url)  # fetches the image data
    with open("./img.png", "wb") as photo:  # writes the image data to a file
        photo.write(r.content)
    break

# Since Google doesn't allow web scraping of its search results, we will use a
# package called google_images_download instead.
from google_images_download import google_images_download

response = google_images_download.googleimagesdownload()  # creating an object
arguments = {
    "keywords": keyword,
    "format": "jpg",
    "limit": 1,
    "print_urls": True,
    "size": "medium"
}  # function parameters
response.download(arguments)  # downloading the image
def __init__(self, source, validation_split=0.2):
    self.source = addSlash(source)
    self.download_settings["output_directory"] = self.source
    response = google_images_download.googleimagesdownload()
    self.download = response.download
from google_images_download import google_images_download
import json

if __name__ == "__main__":
    all_diseases = [
        'Actinic keratoses', 'Basal cell carcinoma',
        'Benign keratosis-like lesions', 'Dermatofibroma', 'Melanocytic nevi',
        'Melanoma', 'Vascular lesions'
    ]

    image_scraper = google_images_download.googleimagesdownload()
    for disease_type in all_diseases:
        image_scraper.download({
            'keywords': f'"{disease_type}"',
            'extract_metadata': True,
            'language': 'English',
            'limit': 100,
            'no_directory': True,
            'output_directory': f'scraped_data/{disease_type}'.replace(' ', '_').lower()
        })
def submittion():
    global location
    location = location_entry.get()
    if len(location) == 0:
        messagebox.showinfo("Entry Error!", "Please Enter A Valid Location!")
    else:
        user_proof = username_get()
        response = google_images_download.googleimagesdownload()
        arguments = {
            "keywords": location + " beautiful city images",
            "limit": 5,
            "print_urls": False,
            "silent_mode": True,
            "format": "png",
            "no_directory": True,
            "output_directory": "C:/Users/aayus/Desktop/PROJECTS/LOGIN",
            "save_source": "paths",
            "exact_size": "320,160"
        }
        paths = response.download(arguments)

        # Parse the saved source file to recover the five local file paths.
        with open("paths.txt", "r") as f:
            splits = f.read()
            content = splits.split("//")
            loc1 = content[1]
            loc2 = content[3]
            loc3 = content[5]
            loc4 = content[7]
            loc5 = content[9]
            req = loc1.split("\t")
            req2 = loc2.split("\t")
            req3 = loc3.split("\t")
            req4 = loc4.split("\t")
            req5 = loc5.split("\t")
            filepath1 = req[0]
            filepath2 = req2[0]
            filepath3 = req3[0]
            filepath4 = req4[0]
            filepath5 = req5[0]

        # Show one of the five downloaded pictures at random.
        filepaths = [filepath1, filepath2, filepath3, filepath4, filepath5]
        val = randint(0, 4)
        display = filepaths[val]
        location_pic = PhotoImage(file=display)
        pic_label = Label(wnd, image=location_pic)
        pic_label.config(image=location_pic)
        pic_label.grid(row=2, column=1)

        # Clean up the downloaded files and empty the recycle bin.
        try:
            os.remove(filepath1)
            os.remove(filepath2)
            os.remove(filepath3)
            os.remove(filepath4)
            os.remove(filepath5)
            os.remove('paths.txt')
            winshell.recycle_bin().empty(confirm=False, show_progress=False, sound=False)
            print("Removed")
        except:
            print("Files don't exist.")
        wnd.update()


wnd.mainloop()
# Assumes the class has been imported directly, e.g.
# from google_images_download.google_images_download import googleimagesdownload
def gimg_downloader(arguments):
    response = googleimagesdownload()
    path_ = response.download(arguments)
    return path_
def image_collector(topics, img_count, img_size):
    ####### DATA COLLECTION #######
    # creates a data directory if one does not exist. we will store images here
    try:
        os.makedirs('data')
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    # creates 'train' and 'test' folders inside 'data'
    # if some already exist, it creates new ones with a number appended
    get_ipython().system('cd data')
    count = 1
    try:
        os.makedirs('data/train1')
    except:
        if os.path.exists('data/train1') == True:
            while True:
                try:
                    os.makedirs('data/train' + str(count))
                    break
                except:
                    count += 1

    # creating 2 lists in case topics contain 2 words
    # topics_search keeps both words, to use for googleimagesdownload, if applicable
    topics_clean = []
    topics_search = []
    for topic in topics:
        try:
            topic = topic.replace(' ', '_')
            topics_clean.append(topic.split('_')[0])
            topics_search.append(topic)
        except:
            topics_clean.append(topic)

    # creating a new directory for each topic and collecting images for it
    print('Collecting images...')
    for topic_clean in topics_clean:
        try:
            os.makedirs('data/train' + str(count) + '/' + topic_clean)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise

        # getting the index from topics_clean, so that we can match it with topics,
        # and use that instead to collect images
        topic_index = topics_clean.index(topic_clean)
        search_term = topics_search[topic_index]
        folder_name = topic_clean

        # image collection using google_images_download
        response = google_images_download.googleimagesdownload()
        arguments = {
            'keywords': search_term,
            'size': 'medium',
            'limit': img_count,
            'format': 'jpg',
            'time_range': '{"time_min":"01/01/2018","time_max":"12/01/2018"}',
            'output_directory': 'data/',
            'image_directory': 'train' + str(count) + '/' + folder_name + '/',
            'silent_mode': True,
            'chromedriver': 'chromedriver.exe'
        }
        paths = response.download(arguments)

    ####### IMAGE PROCESSING #######
    X = []
    y = []
    print('Processing images...')
    for topic_clean in tqdm(topics_clean):
        # opening images in color, resizing them, and making each one into an array
        for f in glob.glob(os.path.join('data/train' + str(count), topic_clean, '*.jpg')):
            try:
                img = Image.open(str(f))
                img = img.convert('RGB')
                img = img.resize((img_size[0], img_size[1]))
                arr = image.img_to_array(img)
                # cropping images
                arr2d = extract_patches_2d(arr, patch_size=img_size)
                for crop in arr2d:
                    X.append(crop)
                    y.append(topic_clean)
            except:
                pass

    X = np.array(X)
    y = np.array(y)
    return X, y
def __init__(self):
    self.downloader = google_images_download.googleimagesdownload()
def model_tester(model, topics, img_size):
    # deletes the validation folder, in case this function has been run already
    try:
        shutil.rmtree('data/validation')
    except:
        pass
    # creates a validation folder
    os.makedirs('data/validation')

    topics_clean = []
    topics_search = []
    for topic in topics:
        try:
            topic = topic.replace(' ', '_')
            topics_clean.append(topic.split('_')[0])
            topics_search.append(topic)
        except:
            topics_clean.append(topic)

    # collecting 1 image for each class
    for topic_clean in topics_clean:
        try:
            os.makedirs('data/validation/' + topic_clean)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise

        # getting the index from topics_clean, so that we can match it with topics,
        # and use that instead to collect images
        topic_index = topics_clean.index(topic_clean)
        search_term = topics_search[topic_index]
        folder_name = topic_clean

        # hides output from the terminal. google_images_download will clutter it otherwise
        original_stdout = sys.stdout
        text_trap = io.StringIO()
        sys.stdout = text_trap

        # image collection using google_images_download; only images from 2014 are
        # downloaded to minimize the possibility of duplicate images
        response = google_images_download.googleimagesdownload()
        arguments = {
            'keywords': search_term,
            'size': 'medium',
            'limit': 3,
            'format': 'jpg',
            'time_range': '{"time_min":"01/01/2014","time_max":"12/01/2014"}',
            'output_directory': 'data/',
            'image_directory': 'validation/' + folder_name + '/',
            'silent_mode': True,
            'chromedriver': 'chromedriver.exe'
        }
        paths = response.download(arguments)

        # restores terminal output
        sys.stdout = original_stdout

    # for each image, show the image and use the model to predict the % chance for each class
    for topic_clean in topics_clean:
        topic_index = topics_clean.index(topic_clean)
        print('Image #' + str(topic_index + 1) + ' from ' + topic_clean + ' class: \n')
        list_of_img = glob.glob('data/validation/' + topic_clean + '/*')
        sorted_files = sorted(list_of_img, key=os.path.getmtime)
        path = sorted_files[0].replace('\\', '/')
        img = Image.open(path)
        img = img.convert('RGB')
        img = img.resize((img_size[0], img_size[1]))

        # making a prediction on img
        img_pred = np.expand_dims(img, axis=0)
        pred = model.predict(img_pred)

        # setting up to show multiple images
        rows = len(topics)
        fig = plt.figure(figsize=(25, 25))
        fig.add_subplot(rows, 1, topic_index + 1)
        plt.imshow(img)

        # print the probability of each image being each class
        for sub_topic in topics:
            subtopic_index = topics.index(sub_topic)
            print(
                f' The model predicted there is a {round((pred[0][subtopic_index]) * 100, 2)} % chance this is a {sub_topic}'
            )
        print('------------------------------------\t')
    return
def pic(place):
    response = google_images_download.googleimagesdownload()  # class instantiation
    # `city` is assumed to be defined in the enclosing scope.
    arguments = {"keywords": "%s %s" % (city, place), "limit": 1, "print_urls": True}
    paths = response.download(arguments)
    img = Image(filename=paths[0][city + ' ' + place][0])
    return img
def scrape_google_images(imageName, count):
    response = google_images_download.googleimagesdownload()
    arguments = {
        "keywords": imageName,
        "limit": count,
        "print_urls": True,
        "chromedriver": r'C:\WebDrivers\chromedriver.exe',
        "prefix": imageName
    }
    paths = response.download(arguments)
def download_pics(keyword, limit):
    ImageSearch = google_images_download.googleimagesdownload()
    SearchArgs = {"keywords": keyword, "limit": limit, "format": "jpg"}
    ImageSearch.download(SearchArgs)
    return
from google_images_download import google_images_download

# Create the image-downloader object.
imgObject = google_images_download.googleimagesdownload()

search_queries = [
    'juventus juan cuadrado',
    'buffon goalkeeper juventus',
    'atletico madrid diego costa',
    'paris keylor navas',
]


def downloadImages(query):
    # "keywords" is the query that the application runs on
    # "format" is the image format
    # "limit" caps the number of images
    # "print_urls" prints each URL (results can also be saved to a CSV)
    # "size" is the image size, specified like in the Google Image tool ("large", "medium", "icon")
    # "aspect_ratio" is the aspect ratio of the images to download ("tall", "square", "wide", "panoramic")
    arguments = {
        "keywords": query,
        "format": "gif",
        "limit": 100,
        "print_urls": True,
        "size": "medium",
        "aspect_ratio": "panoramic"
    }
    try:
        # The original snippet ends at the bare `try:`; the body below is an
        # assumed completion that simply runs the download and reports failures.
        imgObject.download(arguments)
    except Exception as e:
        print(e)
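# A minimal, assumed usage of downloadImages() with the queries defined above;
# the original snippet does not show how the function is invoked.
for query in search_queries:
    downloadImages(query)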
def main(): # Reload the csv files from disk and store the data in a dataframe results = {} all_winners = {} categorie_data = {} best_catg_time = {} clean_awards = {} # Reload the wikidata from disk people = wikidata.call_wikidate('actors', 'actorLabel') people += wikidata.call_wikidate('directors', 'directorLabel') people += wikidata.call_wikidate('actresses', 'actorLabel') things = wikidata.call_wikidate('series', 'seriesLabel') people = [re.sub(r'[^\w\d\s]+', '', person_) for person_ in people] things = [re.sub(r'[^\w\d\s]+', '', thing_) for thing_ in things] # Load the csv files and clean data print("Load Dataframes") for year in resources.years: try: extractor = InfoExtractor() print("Start " + str(year) + " ...") print("Reading ...") extractor.read_dataframe("dirty_gg%s.csv" % year) print("Language ...") extractor.get_english_tweets("text", "language") print("Cleaning ...") extractor.clean_dataframe_column("text", "clean_upper") print("Lowering ...") extractor.make_to_lowercase("clean_upper", "clean_lower") print("Dropping ...") extractor.convert_time('timestamp_ms') extractor.drop_column("user") extractor.drop_column("id") extractor.drop_column("timestamp_ms") extractor.drop_column("language") resources.data[year] = extractor.get_dataframe() print("Finish " + str(year) + " ...") results[year] = {} except: print("Couldn't load Dataframes for" + str(year)) print("Done Dataframes\n") # We start by finding the awards for each year print("Find Awards") for year in resources.years: # try: chunker = Chunker() categorie_data[year] = resources.data[year].copy() categorie_data[year]['categorie'] = categorie_data[year].apply(chunker.extract_wrapper, axis=1) categorie_data[year] = categorie_data[year].loc[categorie_data[year].categorie != 'N/a', :] categorie_data[year].reset_index(drop=True, inplace=True) categorie_data[year] = categorie_data[year].loc[categorie_data[year].categorie.str.split().map(len) > 3, :] best_categories = chunker.pick_categories(categorie_data[year]) best_categories = chunker.filter_categories(best_categories) for i in best_categories: if 'actor' in i: i.replace('actor','performance by an actor') elif 'actress' in i : i.replace('actress','performance by an actress') if 'tv' in i: i.replace('tv','television') elif 'picture' in i and 'motion' not in i: i.replace('picture','motion picture') if 'series' in i and 'television' not in i: i.replace('series','television series') results[year]["Awards"] = best_categories # except: # print("Couldn't find awards for " + str(year)) print("Done Awards\n") # Find the point in time when an award took place print("Find Times") for year in resources.years: try: if year in [2013, 2015]: awards = resources.OFFICIAL_AWARDS_1315 else: awards = resources.OFFICIAL_AWARDS_1819 info_extract = InfoExtractor() for each_award in awards: clean_awards[each_award] = info_extract.clean_tweet(each_award) categorie_data[year]['real_categorie'] = categorie_data[year]['categorie'].apply(lambda x: fuzz_(x, clean_awards)) categorie_data[year] = categorie_data[year].loc[categorie_data[year]['real_categorie'] != 'N/a', :] categorie_data[year].reset_index(drop=True, inplace=True) data_catg = categorie_data[year].groupby(['hour', 'minute', 'real_categorie']).count()['clean_lower'].unstack().reset_index() data_catg = data_catg.dropna(how='all', axis=1) best_catg_time[year] = {} for each_ in list(data_catg.columns): if not each_ in ['hour', 'minute']: best_catg_time[year][each_] = [] max_idx = data_catg[each_].idxmax() best_catg_time[year][each_].append( 
(data_catg.iloc[max_idx - 2]['hour'], data_catg.iloc[max_idx - 2]['minute'])) best_catg_time[year][each_].append( (data_catg.iloc[max_idx - 1]['hour'], data_catg.iloc[max_idx - 1]['minute'])) best_catg_time[year][each_].append( (data_catg.iloc[max_idx]['hour'], data_catg.iloc[max_idx]['minute'])) best_catg_time[year][each_].append( (data_catg.iloc[max_idx + 1]['hour'], data_catg.iloc[max_idx + 1]['minute'])) best_catg_time[year][each_].append( (data_catg.iloc[max_idx + 2]['hour'], data_catg.iloc[max_idx + 2]['minute'])) except: print("Couldn't find times for " + str(year)) print("Finished Times for award ceremony\n") # We search for the hosts print("Searching for Host(s)") for year in resources.years: try: host_categorizer = TweetCategorizer([resources.HOST_WORDS], [], "host_tweet", resources.data[year], 0,resources.data[year].shape[0]) host_tweets = host_categorizer.get_categorized_tweets() hosters = host_categorizer.find_percentage_of_entities(host_tweets, 0.2, people, []) results[year]["Hosts"] = hosters[resources.HOST_WORDS] except: print("Couldn't find Hosts for " + str(year)) print("Found the Hosts!\n") # Search for the winners print("Searching for Winners...") for year in resources.years: try: all_winners[year] = [] awards = resources.OFFICIAL_AWARDS_1315 if year in [2018, 2019]: awards = resources.OFFICIAL_AWARDS_1819 winner_categorizer = TweetCategorizer(awards, resources.STOPWORDS, "award", resources.data[year], 3, resources.data[year].shape[0]) winner_tweets = winner_categorizer.get_categorized_tweets() winners = winner_categorizer.find_list_of_entities(winner_tweets, 1, people,things + wikidata.call_wikidate("films", "filmLabel",str(year - 2),str(year))) for key in winners: results[year][key] = {} if winners[key]: results[year][key]["Winner"] = winners[key][0] else: results[year][key]["Winner"] = "" all_winners[year].append(winners[key]) except: print("Couldn't find Winners for the year " + str(year)) print("Found all the Winners!\n") # Identifying the presenters for the specified year print("Searching for Presenters") for year in resources.years: try: for key, value in best_catg_time[year].items(): data_new = pd.DataFrame(columns=list(resources.data[year].columns)) for each_value in value: data_temp = resources.data[year].loc[(resources.data[year].hour == int(each_value[0])), :] data_temp = data_temp.loc[(data_temp.minute == int(each_value[1])), :] data_new = pd.concat([data_new, data_temp]) presenter_categorizer = TweetCategorizer([resources.PRESENTER_WORDS], [], "presenter_tweet", data_new, 0, data_new.shape[0]) presenter_tweets = presenter_categorizer.get_categorized_tweets() # presenters = find_names(presenter_tweets.clean_upper.tolist(),2,people,all_winners[year],results[year]["Hosts"]) presenters = presenter_categorizer.find_list_of_entities(presenter_tweets, 3, people, [], people=True) presenters = [p for p in presenters[list(presenters.keys())[0]] if (p not in all_winners[year]) and (p not in results[year]["Hosts"])] results[year][key]['Presenters'] = presenters[-3:] if year in [2013, 2015]: awards = resources.OFFICIAL_AWARDS_1315 else: awards = resources.OFFICIAL_AWARDS_1819 for each_ in awards: if not each_ in best_catg_time[year].keys(): results[year][each_]['Presenters'] = [] except: print("Couldn't find presenters for " + str(year)) print("Found the Presenters!\n") # Identify the nominees for each year print("Looking for Nominees...") for year in resources.years: try: for key, value in best_catg_time[year].items(): data_new = 
pd.DataFrame(columns=list(resources.data[year].columns)) for each_value in value: data_temp = resources.data[year].loc[(resources.data[year].hour == int(each_value[0])), :] data_temp = data_temp.loc[(data_temp.minute == int(each_value[1])), :] data_new = pd.concat([data_new, data_temp]) nominee_categorizer = TweetCategorizer([resources.NOMINEE_WORDS], [], "nominee_tweet", data_new, 0, data_new.shape[0]) nominee_tweets = nominee_categorizer.get_categorized_tweets() # presenters = find_names(presenter_tweets.clean_upper.tolist(),2,people,all_winners[year],results[year]["Hosts"]) if ('actress' in key.split()): nominees = nominee_categorizer.find_list_of_entities(nominee_tweets, 6, wikidata.call_wikidate('actresses', 'actorLabel'), [], people=True) elif ('actor' in key.split()): nominees = nominee_categorizer.find_list_of_entities(nominee_tweets, 6, wikidata.call_wikidate('actors', 'actorLabel'), [], people=True) elif ('director' in key.split()): nominees = nominee_categorizer.find_list_of_entities(nominee_tweets, 6, wikidata.call_wikidate('directors', 'actorLabel'), [], people=True) else: nominees = nominee_categorizer.find_list_of_entities(nominee_tweets, 6, [], things + wikidata.call_wikidate("films", "filmLabel", str(year - 2), str(year))) nominees = [p for p in nominees[list(nominees.keys())[0]] if (p not in all_winners[year]) and ( p not in results[year]["Hosts"] and (p not in results[year][key]['Presenters']))] results[year][key]['Nominees'] = nominees[-6:] if year in [2013, 2015]: awards = resources.OFFICIAL_AWARDS_1315 else: awards = resources.OFFICIAL_AWARDS_1819 for each_ in awards: if not each_ in best_catg_time[year].keys(): results[year][each_]['Nominees'] = [] except: print("Couldn't find nominees for the year" + str(year)) print("Found the Nominees!\n") # Searching for best and worst dress on the Red Carpet print("Looking for every mention of Dresses...") for year in resources.years: try: dress_categorizer = TweetCategorizer([resources.DRESS], [], "dress", resources.data[year], 0, resources.data[year].shape[0]) dress_tweets = dress_categorizer.get_categorized_tweets() best_dress_categorizer = TweetCategorizer([resources.BEST_DRESS], [], "best_dress", dress_tweets, 0, dress_tweets.shape[0]) best_dress_tweets = best_dress_categorizer.get_categorized_tweets() probs_best = best_dress_categorizer.list_probabilities(best_dress_tweets, 3, people, [], people=True) best_dressed = list(probs_best.keys()) representative_best_tweets = [] for b in best_dressed: for index, row in best_dress_tweets.iterrows(): if b in str(row["clean_upper"]): representative_best_tweets.append(str(row["text"])) break worst_dress_categorizer = TweetCategorizer([resources.WORST_DRESS], [], "worst_dress", dress_tweets, 0, dress_tweets.shape[0]) worst_dress_tweets = worst_dress_categorizer.get_categorized_tweets() probs_worst = worst_dress_categorizer.list_probabilities(worst_dress_tweets, 3, people, [], people=True) worst_dressed = list(probs_worst.keys()) representative_worst_tweets = [] for w in worst_dressed: for index, row in worst_dress_tweets.iterrows(): if w in str(row["clean_upper"]): representative_worst_tweets.append(str(row["text"])) break results[year]["BestDressed"] = probs_best results[year]["WorstDressed"] = probs_worst results[year]["BestDressedTweets"] = representative_best_tweets results[year]["WorstDressedTweets"] = representative_worst_tweets except: print("Couldn't find dresses for " + str(year)) print("Found best and worsed Dressed celebrites\n") # Looking for memorable moments from the 
award ceremony print("Finding the most memorable Moments...") for year in resources.years: try: moment_categorizer = TweetCategorizer([resources.MOMENTS], [], "moments", resources.data[year], 0, resources.data[year].shape[0]) moment_tweets = moment_categorizer.get_categorized_tweets() link_finder = re.compile(r'\bhttp[^\s ]+\b') results[year]["Moments"] = {} for type in resources.MOMENT_TYPES: type_categorizer = TweetCategorizer([type], [], "jokes", moment_tweets, 0, moment_tweets.shape[0]) type_tweets = type_categorizer.get_categorized_tweets() type_person = type_categorizer.find_list_of_entities(type_tweets, 1, people, [], people=True)[type] if len(type_person) > 0: type_person = type_person[0] results[year]["Moments"][type] = {} results[year]["Moments"][type]["Person"] = type_person for index, row in type_tweets.iterrows(): if type_person in str(row["clean_upper"]): results[year]["Moments"][type]["Tweet"] = str(row["text"]) break http_categorizer = TweetCategorizer(["http"], [], "links", type_tweets, 0, type_tweets.shape[0], column="text") http_tweets = http_categorizer.get_categorized_tweets() http_tweets = http_tweets.reset_index(drop=True) links = set() if (len(http_tweets) > 0): results[year]["Moments"][type]["Tweet"] = str(http_tweets["text"][0]) for index, row in http_tweets.iterrows(): matches = link_finder.findall(str(row["text"])) for m in matches: links.add(m) results[year]["Moments"][type]["Link"] = list(links)[:3] except: print("Couldn't find moments for the year " + str(year)) print("Found the best Moments!\n") # Finding people who were expected to win, but did not win print("Searching for the biggest Snubbed personalities from the show... ") for year in resources.years: try: snub_categorizer = TweetCategorizer([resources.SNUB], ["Golden", "Golden Globes", "Hollywood", "Globe Awards", "Disney","Oscar","Common"],"snub", resources.data[year], 0,resources.data[year].shape[0]) snub_tweets = snub_categorizer.get_categorized_tweets() most_snub_categorizer = TweetCategorizer([resources.SNUB], [], "most_snub", snub_tweets, 0,snub_tweets.shape[0]) most_snub_tweets = most_snub_categorizer.get_categorized_tweets() probs_most_snub = most_snub_categorizer.list_probabilities(most_snub_tweets, 2, people, [], people=True) most_snubbed = list(probs_most_snub.keys()) results[year]["Snubbed"] = probs_most_snub except: print("Couldn't find snubbed celebs for " + str(year)) print("Found all interestingly snubbed celebrities\n") # Preparing output files print("Write Markdown") markdown = "" for year in resources.years: markdown += "# " + str(year) + " Golden Globes\n" try: markdown += "## Hosts\n" for h in results[year]["Hosts"]: markdown += " - " + h + "\n" except: print("Couldn't write markdown hosts for " + str(year)) try: markdown += "## Best Dressed\n" i = 1 best_dressed = list(results[year]["BestDressed"].keys()) for b in best_dressed: markdown += " " + str(i) + ". 
" + b + " (" + str(results[year]["BestDressed"][b]) + ") " + "\n" i += 1 markdown += "\n" for b in best_dressed: response = google_images_download.googleimagesdownload() search = b + " " + str(year) + " Golden Globes Dress" arguments = {"keywords": search, "limit": 1,"format": "jpg", "print_urls": True} paths = response.download(arguments) print(paths) markdown += "<img src='file://" + paths[0][search][0] + "' height=300px alt='" + search + "'> " markdown += "\n" markdown += "\n" for b in results[year]["BestDressedTweets"]: markdown += b + " \n\n" markdown += "\n" except: print("Couldn't write markdown best dressed for " + str(year)) try: markdown += "## Worst Dressed\n" i = 1 worst_dressed = list(results[year]["WorstDressed"].keys()) for w in worst_dressed: markdown += " " + str(i) + ". " + w + " (" + str(results[year]["WorstDressed"][w]) + ") " + "\n" i += 1 markdown += "\n" for w in worst_dressed: response = google_images_download.googleimagesdownload() search = w + " " + str(year) + " Golden Globes Dress" arguments = {"keywords": search, "limit": 1, "print_urls": False} paths = response.download(arguments) print(paths) markdown += "<img src='file://" + paths[search][0] + "' height=300px alt='" + search + "'> " markdown += "\n" markdown += "\n" for w in results[year]["WorstDressedTweets"]: markdown += w + " \n\n" markdown += "\n" except: print("Couldn't write markdown worst dressed for " + str(year)) try: i=1 markdown += "## Who got Snubbed?\n" most_snubbed = list(results[year]["Snubbed"].keys()) for b in most_snubbed: if len(b.split())>1: markdown += " " + str(i) + ". " + b + " (" + str(results[year]["Snubbed"][b]) + ") " + "\n" i += 1 markdown += "\n" markdown += "\n" except: print("Couldn't write markdown snubbed for " + str(year)) try: markdown += "#### Awards found\n" for a in results[year]["Awards"]: markdown += " - " + a + "\n" except: print("Couldn't write markdown awards for " + str(year)) try: markdown += "## Moments\n" for moment in results[year]["Moments"]: markdown += "## " + moment.replace("|", " or ") + " moments\n" markdown += "##### Person:\n" markdown += "- " + results[year]['Moments'][moment]["Person"] + "\n" markdown += "##### Tweet:\n" markdown += "- " + results[year]['Moments'][moment]["Tweet"] + "\n" markdown += "##### Links:\n" for link in results[year]['Moments'][moment]["Link"]: markdown += "- " + link + "\n" markdown += "\n" except: print("Couldn't write markdown moments for the year" + str(year)) try: markdown += "## Awards\n" if year in [2013, 2015]: awards = resources.OFFICIAL_AWARDS_1315 else: awards = resources.OFFICIAL_AWARDS_1819 for cat in awards: markdown += "### " + cat + "\n" # Presenters markdown += "#####Presenters:\n" for a in results[year][cat]['Presenters']: markdown += "- " + a + "\n" # Nominees markdown += "\n#####Nominees:\n" for a in results[year][cat]['Nominees']: markdown += " - " + a + "\n" # Winner markdown += "\n#####Winner:\n" markdown += "- " + results[year][cat]['Winner'] + "\n" except: print("Couldn't write award results for the year " + str(year)) print("Completed Markdown!\n") print("Please run - python autograder.py ",str(year)) # Saving the final results as a Markdown (for easy access) with open('results.md', 'w') as file: file.write(markdown) # Saving the final results as JSON file (for autograder) with open("results.json", "w") as f: json.dump(results, f) return
def __init__(self, project_directory):
    self.response = google_images_download.googleimagesdownload()
    self.download_directory = project_directory
def main(snake_db_fp: Path):
    sneks_already_encountered = []
    sneks_skipped = []
    sneks_already_encountered_fp = Path("sneks_encountered.json")
    if sneks_already_encountered_fp.exists():
        with sneks_already_encountered_fp.open() as f:
            j = json.load(f)
            sneks_already_encountered = j["encountered"]
            sneks_skipped = j["skipped"]

    with snake_db_fp.open() as f:
        snake_download_dir = Path.cwd() / "curated_downloads"
        snake_download_dir.mkdir(exist_ok=True)
        for snake in csv.DictReader(f):
            if (snake["index"] in sneks_already_encountered
                    or snake["index"] in sneks_skipped):
                continue
            else:
                try:
                    # Download a handful of candidate images for this snake.
                    response = google_images_download.googleimagesdownload()
                    downloads = list(
                        response.download({
                            "keywords": f"{snake['genus']} {snake['species']} {snake['common_name']}",
                            "limit": 4,
                        })[0].values())[0]

                    # Paste the candidates side by side into one image for review.
                    images = [Image.open(fp) for fp in downloads]
                    widths, heights = zip(*(i.size for i in images))
                    total_width = sum(widths)
                    max_height = max(heights)
                    new_im = Image.new("RGB", (total_width, max_height))
                    x_offset = 0
                    for im in images:
                        new_im.paste(im, (x_offset, 0))
                        x_offset += im.size[0]
                    new_im.show()

                    # Ask the user which candidate to keep.
                    chosen_snake_idx = None
                    while True:
                        try:
                            chosen_snake_idx = (int(
                                input(
                                    f"Which snake do you choose? (Select: {list(range(1, len(images) + 1))}) "
                                )) - 1)
                            chosen_snake_fp = Path(downloads.pop(chosen_snake_idx))
                        except ValueError:
                            raise NameError  # skip
                        except IndexError:
                            pass  # invalid input
                        else:
                            break

                    # Keep the chosen image and delete the rest.
                    chosen_snake_fp.replace(
                        snake_download_dir /
                        f"{snake['index']}{chosen_snake_fp.suffix.lower()}")
                    sneks_already_encountered.append(snake["index"])
                    for other_download in downloads:
                        Path(other_download).unlink()
                except NameError:
                    sneks_skipped.append(snake["index"])

                subprocess.check_call("""osascript -e \'quit app "Preview"\'""",
                                      shell=True)
                with sneks_already_encountered_fp.open("wt") as f:
                    json.dump(
                        {
                            "encountered": sneks_already_encountered,
                            "skipped": sneks_skipped,
                        },
                        f,
                    )
from google_images_download import google_images_download as gmd

google = gmd.googleimagesdownload()

#####################
# EDIT THIS SECTION #
#####################
keywords = [{
    "word": "family gathering, indian family, chinese family",
    "limit": 20
}, {
    "word": "china technology, china artificial island, london architecture, railway, busy city",
    "limit": 20
}, {
    "word": "business meeting real, real office, computer warehouse, amazon warehouse",
    "limit": 20
}, {
    "word": "manhattan street, india street, china street, street food, scotland village",
    "limit": 30
}, {
    "word": "scottish highlands, scottish castles",
    "limit": 20
}, {
    "word": "new year",
    "limit": 10
}]
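# Assumed driver loop for the configuration above; the original snippet stops
# after defining `keywords`. "keywords" and "limit" are standard
# google_images_download options mapped from each entry's "word" and "limit".
for entry in keywords:
    google.download({"keywords": entry["word"], "limit": entry["limit"]})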
from google_images_download import google_images_download  # importing the library

response = google_images_download.googleimagesdownload()  # class instantiation
arguments = {
    "keywords": "timber truck malaysia",
    "limit": 100,
    "print_urls": True
}  # creating the dictionary of arguments
paths = response.download(arguments)  # passing the arguments to the function
print(paths)
def auto_video(query,summary,path_to_audio): # case in colab add: # !pip install setuptools cookiejar git+https://github.com/Joeclinton1/google-images-download.git dhash ffmpeg-python pydub import urllib3 import requests from bs4 import BeautifulSoup import numpy as np import os import ffmpeg import gdown import pandas as pd from os import listdir from PIL import Image from google_images_download import google_images_download import dhash from scipy.spatial.distance import hamming import json import re import os import math import numpy as np from pydub import AudioSegment print("###### init ######") title=re.sub(' ','_',query) title=re.sub(',','_',title) title=re.sub('\.','',title) query=re.sub('_',' ',title) response = google_images_download.googleimagesdownload() def downloadimages(query): # aspect ratio = the height width ratio of images to download ("tall, square, wide, panoramic") arguments = {"keywords": query, "format": "jpg", "limit":10, "print_urls":True, "size": "medium", "aspect_ratio":"panoramic"} try: response.download(arguments) except FileNotFoundError: arguments = {"keywords": query, "format": "jpg", "limit":4, "print_urls":True, "size": "medium"} try: response.download(arguments) except: pass print("###### downloading images ######") downloadimages(query) print("###### hashing images ######") def hashing(image,size=(25,25)): img = Image.open(image).convert("L") img2 = img.resize(size) row, col = dhash.dhash_row_col(img2) hash=dhash.format_hex(row, col) return(hash) print("###### saving hashes ######") dir='./downloads/' + query + '/' dirs = listdir(dir) print("###### calculating differences between images ######") hashes=[] for i in dirs: a=dir+i hash=hashing(a) hashes.append(hash) def dif_hashing(a,b): diff=dhash.get_num_bits_different(int(a,16),int(b,16)) return(diff) list_difs_hashings=[0] for i in range(len(hashes)): for j in range(len(hashes)): if i<j: diff_hash = dif_hashing(hashes[i],hashes[j]) if list_difs_hashings[0]==0: list_difs_hashings[0]=diff_hash elif list_difs_hashings[0]<diff_hash: list_difs_hashings.append(diff_hash) list_difs_hashings.sort() list_difs_hashings=list_difs_hashings[-4:] list_difs_hashings.sort(reverse=True) images=[] for i in range(len(hashes)): for j in range(len(hashes)): if i<j: diff_hash = dif_hashing(hashes[i],hashes[j]) if diff_hash in list_difs_hashings: if dirs[i] not in images: images.append(dirs[i]) if dirs[j] not in images: images.append(dirs[j]) if len(images)>4: images=images[:4] break if len(images)!=4: return("try again, less than 4 images") print("###### creating folders and saving the most different ones ######") os.mkdir('./downloads/diffimages') os.mkdir('./downloads/movie') for image in images: im=Image.open('./downloads/' + query + '/'+image) im.save('./downloads/diffimages/' + image) print("###### making a video out of images ######") SECONDS_BY_IMG=6 FRAMERATE=1/SECONDS_BY_IMG stream=ffmpeg.input('downloads/diffimages/'+'*.jpg', pattern_type='glob', framerate=FRAMERATE).output('downloads/movie/'+title+'.mp4').run() summary_list_parts = summary.split(".") list_absolute_subtitle_time_by_phrase=[] for phrase in summary_list_parts: list_absolute_subtitle_time_by_phrase.append(SECONDS_BY_IMG) endtime_for_each_sub = np.cumsum(list_absolute_subtitle_time_by_phrase) init_time_for_each_sub = endtime_for_each_sub-list_absolute_subtitle_time_by_phrase endtime_for_each_sub = endtime_for_each_sub.tolist() print(type(endtime_for_each_sub),type(endtime_for_each_sub[1])) print("###### create rst file from summary ######") 
subtitles_path='downloads/movie/subtitles_of_' f = open(subtitles_path + title + '.rst', "w") for i in range(len(summary_list_parts)): if endtime_for_each_sub[i] < 10: endtime_for_each_sub_ = "0"+str(endtime_for_each_sub[i]) else: endtime_for_each_sub_ = str(endtime_for_each_sub[i]) if init_time_for_each_sub[i] < 10: init_time_for_each_sub_ = "0"+str(init_time_for_each_sub.tolist()[i]) else: init_time_for_each_sub_ = str(init_time_for_each_sub[i]) f.write(str(i+1)+"\n00:00:"+init_time_for_each_sub_+",00 --> 00:00:"+endtime_for_each_sub_+",00\n"+summary_list_parts[i]+"\n") f.close() f = open(subtitles_path + title + '.rst', "r") print("###### setting environment variables ######") os.environ["VIDEO"] = './downloads/movie/'+title +'.mp4' os.environ["SUBTITLES"] = subtitles_path +title+'.rst' os.environ["VIDEO_WITH_SUBTITLES_AND_AUDIO"] = "./downloads/movie/subtitled_with_music_"+title +'.mp4' os.environ["VIDEO_WITH_SUBTITLES"] = "./downloads/movie/subtitled_"+title +'.mp4' os.environ["AUDIO"] = path_to_audio AUDIO = path_to_audio print("###### creating a video with subtitles ######") !ffmpeg -i $VIDEO -vf subtitles=$SUBTITLES $VIDEO_WITH_SUBTITLES sound = AudioSegment.from_mp3(path_to_audio) sound_trimmed = sound[:SECONDS_BY_IMG*4*1000] sound_trimmed.export(path_to_audio, format="mp3") print("############\n\n\n\n\n",len(sound_trimmed)) print("###### merging audio in video ######") !ffmpeg -i $VIDEO_WITH_SUBTITLES -i $AUDIO -c:v libx264 -vf format=yuv420p $VIDEO_WITH_SUBTITLES_AND_AUDIO #from google.colab import drive # drive.mount('/content/gdrive') # query = "Lifestyle choices can reduce risk for heartburn, study finds." # summary = "Women who make healthy lifestyle choices can significantly reduce the risk of heartburn. Other thing. ooooother thing. one more thing" # path_to_audio= 'gdrive/MyDrive/path/to/audio/audio.mp3' # auto_video(query,summary,path_to_audio)
def run_crawler(arguments):
    loader = google_images_download.googleimagesdownload()
    return loader.download(arguments)
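# Hypothetical invocation of run_crawler(); the keyword and limit values are
# illustrative and not part of the original snippet.
if __name__ == "__main__":
    result = run_crawler({"keywords": "red panda", "limit": 10})
    print(result)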