def download_ncodes_image_net(self, path_to_save=None):
    """
    Download the ncodes (WordNet IDs) for the image-net dataset and save
    them, restructured, as a csv file.

    Args:
        path_to_save {str}: folder path where the ncodes csv is to be saved.
            Defaults to this source file's directory when None.

    Returns {str}: absolute path to the ncodes csv.
    """
    # parameter initialization
    ncodes = pd.DataFrame([])
    # fix: `is None` (identity) is the correct None check, not `== None`
    if path_to_save is None:
        path_to_save = os.path.realpath(os.path.dirname(__file__))
    ncodes_path = os.path.join(path_to_save, "ncodes.csv")

    if not os.path.exists(ncodes_path):
        print("\nDownloading imagenet ncodes...")
        # download ncodes data
        with anchor(self.NCODES_DATA_URL) as response:
            html = response.read()
        soup = BeautifulSoup(html, features="lxml")
        # every <a> tag carries an ncode in its href ("...wnid=<code>") and
        # the human-readable category name as its text content
        for code_num, link in enumerate(tqdm(soup.findAll('a'))):
            code = (link.get('href').split("wnid=")[-1])
            values = link.contents[0]
            ncodes.loc[code_num, "code"] = code
            ncodes.loc[code_num, "name"] = values
        # add relevant columns to ncode data (user later marks which
        # categories to download and how many images per category)
        ncodes["to_download"] = False
        ncodes["how_many"] = -1
        ncodes.to_csv(ncodes_path, index=False)  # save ncode data
        print("Download complete !\n")
    else:
        print("\nncodes data already present in the mentioned folder")
    return ncodes_path
def get_an_image(self, url_data, folder_path, queue_, max_time=7):
    """
    Download a single image from its url and push a report json onto the
    queue describing the outcome ("done", "SaveError", or "Error").

    Args:
        url_data {tuple}: (url, image_code) for the image to be downloaded.
        folder_path {str}: folder whose "Images" subfolder receives the
            downloaded image.
        queue_ {multiprocessing queue}: queue to push the result onto after
            the asynchronous download attempt.
        max_time {int}: maximum seconds permitted for the url request.

    Returns: None
    """
    # initializing variables
    url = url_data[0]
    image_code = url_data[1]
    _, ext = os.path.splitext(url)
    # fall back to .jpg when the url extension is not a known image format
    ext = ".jpg" if ext.lower() not in self.IMAGE_FORMATS else ext
    image_path = os.path.join(folder_path, "Images", image_code + ext)

    # starting the image download
    download_start_time = time.time()
    try:
        save_error = None
        # opening from the url
        with anchor(url, timeout=max_time) as request:
            with open(image_path, 'wb') as f:
                try:
                    # writing the image to the disk
                    f.write(request.read())
                except Exception as e:
                    save_error = e
        if save_error is not None:
            # saving error: report it, and (bug fix) remove the empty or
            # partially-written file so a corrupt image is not left on disk
            # to be mistaken later for a successful download
            req_time = round(time.time() - download_start_time, 1)
            if os.path.exists(image_path):
                os.remove(image_path)
            download_json = self.create_report_json(
                url=url, image_code=image_code, status="SaveError",
                spent_time=req_time, error_description=str(save_error)
            )
            queue_.put(download_json)
            return
        # download success
        req_time = round(time.time() - download_start_time, 1)
        download_json = self.create_report_json(
            url=url, image_code=image_code, status="done",
            spent_time=req_time, error_description="Success"
        )
        queue_.put(download_json)
    except Exception as e:
        # save data if error during download process (url open / timeout)
        req_time = round(time.time() - download_start_time, 1)
        download_json = self.create_report_json(
            url=url, image_code=image_code, status="Error",
            spent_time=req_time, error_description=str(e)
        )
        queue_.put(download_json)
def download_speed(self):
    """
    Measure the download speed of the connected internet network by timing
    the download of a 1 MB test file.

    Returns {int}: download speed (KB/s, rounded), or False when the test
        download fails for any reason.
    """
    try:
        # parameter initialization
        URL = "http://speedtest.ftp.otenet.gr/files/test1Mb.db"
        FILE_SIZE = 1048.576  # test file size in KB (1 MB = 1048.576 KB)

        # download a file to test download speed; `with` closes the
        # response handle (the original leaked it)
        start = time.time()
        with anchor(URL, timeout=7) as file:
            file.read()
        end = time.time()

        time_difference = end - start
        return round(FILE_SIZE / time_difference)
    except Exception:
        # narrowed from a bare `except:` so SystemExit / KeyboardInterrupt
        # still propagate; any network/timeout failure reports False
        return False
def ncode_image_download_sequentially(self, ncode_data, folder_path, verbose=True, max_time=7):
    """
    Sequentially download the images for one ncode (object category).

    Args:
        ncode_data {pandas-dataframe}: image-net image urls data for the
            particular ncode; must have "img_url" and "img_code" columns.
        folder_path {str}: absolute path whose "Images" subfolder receives
            the downloaded images.
        verbose {bool}: whether to print download stats for this ncode.
        max_time {int}: maximum download time permitted per image url.

    Returns:
        ncode_report {list}: one report json per url, each describing the
            download outcome ("done", "SaveError", or "Error").

    Side effects: appends counts to self.report['total'] / ['error'] /
    ['save_error'].
    """
    # variable intialization
    stats_report = list()   # per-url status strings, for frequency counts
    ncode_report = list()   # per-url full report jsons (the return value)

    # initiating sequential image download
    # NOTE(review): .loc[image_num, ...] assumes ncode_data has a plain
    # 0..n-1 index aligned with enumerate — confirm callers reset_index
    for image_num, url in enumerate(tqdm(ncode_data["img_url"], file=sys.stdout)):
        image_code = ncode_data.loc[image_num, "img_code"]
        _, ext = os.path.splitext(url)
        # fall back to .jpg when the url extension is not a known format
        ext = ".jpg" if ext.lower() not in self.IMAGE_FORMATS else ext

        # check if image already present — skip the download and record it
        # as done
        if os.path.exists(os.path.join(folder_path, "Images", image_code + ext)):
            download_json = self.create_report_json(
                url=url, image_code=image_code + ext, status="done",
                spent_time="-", error_description="Already present")
            stats_report.append("done")
            ncode_report.append(download_json)
            continue

        # starting the image download
        download_start_time = time.time()
        try:
            # opening from the url
            with anchor(url, timeout=max_time) as request:
                with open(os.path.join(folder_path, "Images", image_code + ext), 'wb') as f:
                    try:
                        # writing the image to the disk
                        f.write(request.read())
                    except Exception as e:
                        # saving error: record it and move to the next url
                        # (skips the shared append tail below)
                        req_time = round(time.time() - download_start_time, 1)
                        download_json = self.create_report_json(
                            url=url, image_code=image_code + ext,
                            status="SaveError", spent_time=req_time,
                            error_description=str(e))
                        stats_report.append(download_json["status"])
                        ncode_report.append(download_json)
                        continue
            # download success
            req_time = round(time.time() - download_start_time, 1)
            download_json = self.create_report_json(
                url=url, image_code=image_code + ext, status="done",
                spent_time=req_time, error_description="Success")
        except Exception as e:
            # save data if error during download process (url open /
            # timeout / file open)
            req_time = round(time.time() - download_start_time, 1)
            download_json = self.create_report_json(
                url=url, image_code=image_code + ext, status="Error",
                spent_time=req_time, error_description=str(e))
        # shared tail for both the success and Error paths above
        stats_report.append(download_json["status"])
        ncode_report.append(download_json)

    # stats frequency distribution: status -> occurrence count
    stats_report_json = {
        x: stats_report.count(x)
        for x in set(stats_report)
    }
    _, category_name = os.path.split(folder_path)

    # stats reporting
    print()
    total = sum(stats_report_json.values())
    if verbose:
        if "done" in stats_report_json.keys():
            print("Success/Present : {}/{}".format(
                stats_report_json["done"], total))
    self.report['total'].append(total)
    if "Error" in stats_report_json.keys():
        http_error = stats_report_json["Error"]
        if verbose:
            print("HTTP/URL/No-image errors : {}/{}".format(
                http_error, total))
        self.report['error'].append(http_error)
    if "SaveError" in stats_report_json.keys():
        save_error = stats_report_json["SaveError"]
        if verbose:
            print("Writing errors : {}/{}\n".format(save_error, total))
        self.report['save_error'].append(save_error)
    return ncode_report