def test_create_and_move_blob(self):
    """Create a file at a gs:// URI, move it, and check only the destination remains."""
    source_uri = "gs://aries_test/new_file.txt"
    moved_uri = "gs://aries_test/moved_file.txt"

    # The source must not exist before this test creates it.
    source = StorageFile(source_uri)
    self.assertFalse(source.blob.exists())
    source.create()
    self.assertTrue(source.blob.exists())

    # After the move the source is gone and the destination exists.
    source.move(moved_uri)
    self.assertFalse(source.exists())
    moved = StorageFile(moved_uri)
    self.assertTrue(moved.exists())

    # Clean up the moved file so the test can be re-run.
    moved.delete()
def get_file_size(self, path, job_name=None, **kwargs):
    """Return the size of ``path`` divided by 2**30 (i.e. converted to GB).

    Args:
        path (str): Location to measure. A trailing ``*`` marks the path as a
            prefix; otherwise it is looked up as a file first, then a folder.
        job_name (str, optional): Not used by this implementation.
        **kwargs: Not used by this implementation.

    Returns:
        float: Size in GB, or 0.0 when the path is neither an existing file
        nor an existing folder. For paths whose protocol is "Local" the
        literal ``True`` is returned instead, without inspecting the path.

    Raises:
        BaseException: Anything raised while querying the size is logged and
        then re-raised unchanged.
    """
    # Local paths live on the disk image and are assumed to be present.
    if self.__get_file_protocol(path) == "Local":
        logging.warning(f"Ignoring path '{path}' as it is local on the disk image. Assuming the path is present!")
        return True

    try:
        if path.endswith("*"):
            # Wildcard: measure everything sharing the prefix.
            raw_size = StoragePrefix(path.rstrip("*")).size
        else:
            # The same path may denote a file or a folder; file wins.
            as_file = StorageFile(path)
            as_folder = StorageFolder(path)
            if as_file.exists():
                raw_size = as_file.size
            elif as_folder.exists():
                raw_size = as_folder.size
            else:
                raw_size = 0

        # Convert to GB
        return float(raw_size) / 2**30

    except BaseException as err:
        logging.error(f"Unable to get file size: {path}")
        if str(err) != "":
            logging.error(f"Received the following msg:\n{err}")
        raise
def transfer_file(to_folder_uri, file_id=None, file_info_href=None):
    """Copy one BaseSpace file into a destination folder.

    Args:
        to_folder_uri (str): Folder URI the file is copied into.
        file_id: BaseSpace file ID. Takes precedence over ``file_info_href``
            when both are given.
        file_info_href (str): API href of the file info; the file ID is taken
            from its last path segment when ``file_id`` is not given.

    Returns:
        str: URI of the file at the destination, whether it was copied now or
        was skipped because a same-sized file already exists there.

    Raises:
        ValueError: If neither ``file_id`` nor ``file_info_href`` is provided.
    """
    if file_id is None and file_info_href is None:
        raise ValueError("Either BaseSpace file_id or file_info_href is needed for file transfer.")

    # Determine the file_id, file_info and file_content_href
    if file_id is not None:
        file_info_href = "v1pre3/files/%s" % file_id
        file_content_href = "v1pre3/files/%s/content" % file_id
    else:
        file_id = file_info_href.strip("/").split("/")[-1]
        file_content_href = "%s/content" % file_info_href

    file_info = api_response(file_info_href)
    logger.debug("Transferring file from BaseSpace: %s" % file_content_href)

    # Each MiSeq run may have multiple FASTQ files with the same name,
    # so the BaseSpace file ID is added to FASTQ filenames.
    fastq_suffix = ".fastq.gz"
    filename = file_info.get("Name")
    if filename.endswith(fastq_suffix):
        filename = filename.replace(fastq_suffix, "_%s.fastq.gz" % file_id)

    to_uri = os.path.join(to_folder_uri, filename)
    dest_file = StorageFile(to_uri)

    # Skip the copy when a file of the same (non-zero) size is already there.
    expected_size = file_info.get("Size")
    already_transferred = (
        expected_size
        and dest_file.exists()
        and dest_file.size
        and dest_file.size == expected_size
    )
    if already_transferred:
        logger.debug("File %s exists at destination: %s" % (filename, to_uri))
        return to_uri

    from_uri = build_api_url(file_content_href)
    StorageFile(from_uri).copy(to_uri)
    return to_uri
def get_intraday_series(self, symbol, date=None):
    """Gets a pandas data frame of intraday series data for one day.

    Args:
        symbol (str): The name of the equity/stock.
        date (str, optional): Date formatted per ``self.date_fmt``,
            e.g. 2017-02-12. Defaults to None.

    Returns:
        A pandas data frame indexed by ``timestamp`` with ``symbol`` attached
        as an attribute. When ``date`` is given, the frame holds that day's
        rows and may be empty. When ``date`` is None, the search starts today
        and steps back one day at a time until a non-empty frame is found or
        100 days have been tried.

        NOTE(review): the loop only ends once ``df`` has been assigned, so an
        empty frame (rather than None) appears to be what is returned when no
        data is found in the 100-day window - confirm against callers.
    """
    series_type = self.intraday_series_type
    # Remember what the caller asked for; ``date`` is overwritten while searching.
    requested_date = date
    days_back = 0
    df = None

    def rows_for_day(frame, day_start, day_end):
        # Keep rows with day_start <= timestamp < day_end.
        in_day = (frame['timestamp'] >= day_start) & (frame['timestamp'] < day_end)
        return frame[in_day]

    # A specified date is attempted exactly once, even if the result is empty.
    # Without a date, keep stepping back while the result is empty.
    while df is None or (requested_date is None and df.empty and days_back < 100):
        if requested_date is None:
            candidate_day = datetime.datetime.now() - datetime.timedelta(days=days_back)
            date = candidate_day.strftime(self.date_fmt)
            logger.debug("Getting data for %s" % date)

        # next_date is a date string used as the exclusive upper bound
        # when comparing against the timestamp column.
        day_start = datetime.datetime.strptime(date, self.date_fmt)
        following_day = day_start.date() + datetime.timedelta(days=1)
        next_date = following_day.strftime(self.date_fmt)

        if not self.cache:
            # Request new data
            df = self.__request_data(symbol, series_type, 'full')
            df = rows_for_day(df, date, next_date)
        else:
            # Check if data has been cached.
            file_path = self.__cache_file_path(symbol, series_type, date)
            storage_file = StorageFile(file_path)
            if storage_file.exists():
                logger.debug("Reading existing data... \n%s" % file_path)
                with storage_file('r') as f:
                    df = pd.read_csv(f, index_col=0, parse_dates=['timestamp'])
            else:
                df = self.__intraday_get_full_data(symbol)
                df = rows_for_day(df, date, next_date)

        days_back += 1

    if df is not None:
        df.set_index('timestamp', inplace=True)
        df.symbol = symbol
    return df
def test_binary_read_write(self):
    """Write bytes to a new file, read them back, then delete the file."""
    # File does not exist, a new one will be created
    uri = os.path.join(self.TEST_ROOT, "test.txt")
    binary_file = StorageFile(uri).open("wb")

    # A file opened for binary writing is seekable but not readable.
    self.assertEqual(binary_file.scheme, self.SCHEME)
    self.assertTrue(binary_file.seekable())
    self.assertFalse(binary_file.readable())

    # Each write reports 3 bytes and advances the position accordingly.
    for chunk, expected_position in ((b"abc", 3), (b"def", 6)):
        self.assertEqual(binary_file.write(chunk), 3)
        self.assertEqual(binary_file.tell(), expected_position)
    binary_file.close()
    self.assertTrue(binary_file.exists())

    # Re-open for reading and verify both chunks were stored in order.
    binary_file.open('rb')
    self.assertEqual(binary_file.read(), b"abcdef")
    binary_file.close()

    binary_file.delete()
    self.assertFalse(binary_file.exists())
def get_file_size(self, path, job_name=None, **kwargs):
    """Return the size of ``path`` divided by 2**30 (i.e. converted to GB).

    A path that is not a prefix is polled for existence up to 10 times, with
    a growing pause between checks, before it is treated as missing. If the
    lookup fails with a "dictionary changed size" error, the whole lookup is
    retried, up to 5 attempts in total.

    Args:
        path (str): Location to measure. A trailing ``*`` marks the path as a
            prefix; otherwise it is looked up as a file first, then a folder.
        job_name (str, optional): Only forwarded to the retry call.
        **kwargs: ``retry_count`` (int, default 0) is the number of lookup
            attempts already made; it is managed by the retry recursion.

    Returns:
        float: Size in GB, or 0.0 when the path never showed up as a file or
        folder. ``0`` is returned once the retry budget is exhausted. For
        paths whose protocol is "Local" the literal ``True`` is returned
        without inspecting the path.

    Raises:
        BaseException: Any error other than "dictionary changed size" is
        logged and re-raised unchanged.
    """
    max_retries = 5
    max_trials = 10
    retry_count = kwargs.get("retry_count", 0)

    # Local paths live on the disk image and are assumed to be present.
    if self.__get_file_protocol(path) == "Local":
        logging.warning(f"Ignoring path '{path}' as it is local on the disk image. Assuming the path is present!")
        return True

    # Retry budget exhausted. retry_count equals the number of attempts
    # already made, so report it as-is (it was previously overstated by one).
    if retry_count >= max_retries:
        logging.warning(f"Failed to get size of '{path}'! Attempted to retrieve size {retry_count} times.")
        return 0

    try:
        if path.endswith("*"):
            # Wildcard: measure everything sharing the prefix.
            _size = StoragePrefix(path.rstrip("*")).size
        else:
            # The same path may denote a file or a folder; file wins.
            _file = StorageFile(path)
            _folder = StorageFolder(path)
            _size = 0

            # Poll exactly max_trials times. The previous loop checked an
            # 11th time and logged "Trial 11/10".
            for trial in range(1, max_trials + 1):
                # Growing pause: none before the first check, then 1s, 2s, ...
                time.sleep(trial - 1)
                if _file.exists():
                    _size = _file.size
                    break
                if _folder.exists():
                    _size = _folder.size
                    break
                logging.warning(f"Cannot get size of '{path}' as it does not exist! Trial {trial}/{max_trials}")
            else:
                # Loop finished without a break: the path never appeared.
                logging.error(f"Cannot get size of '{path}' as it doesn't exist after multiple trials!")

        # Convert to GB
        return float(_size) / 2**30

    except BaseException as e:
        logging.error(f"Unable to get file size: {path}")
        if str(e) != "":
            logging.error(f"Received the following msg:\n{e}")
        # Only this specific error is retried; everything else propagates.
        if "dictionary changed size" in str(e):
            kwargs['retry_count'] = retry_count + 1
            return self.get_file_size(path, job_name, **kwargs)
        raise
def test_http(self):
    """Check exists() and copy() on HTTP(S) URLs."""
    # URL does not exist
    missing = StorageFile("http://example.com/abc/")
    self.assertFalse(missing.exists())

    # URL exists
    reachable = StorageFile("https://www.google.com")
    self.assertTrue(reachable.exists())

    # Download. Copy to local file.
    remote_pdf = StorageFile("https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf")
    local_file_path = os.path.join(self.test_folder_path, "test.pdf")
    # Remove any copy left over from a previous run so the check is meaningful.
    if os.path.exists(local_file_path):
        os.remove(local_file_path)
    remote_pdf.copy(local_file_path)

    self.assertTrue(os.path.exists(local_file_path))
    self.assertGreater(StorageFile(local_file_path).size, 0)
    StorageFile(local_file_path).delete()
def test_create_copy_and_delete_file(self):
    """Copy files into a temporary folder, overwrite one, then empty the folder."""
    new_folder_uri = os.path.join(self.TEST_ROOT, "new_folder")
    fixture_folder = os.path.join(self.TEST_ROOT, "test_folder_0")

    with TempFolder(new_folder_uri) as folder:
        self.assertTrue(folder.is_empty())

        # Create a sub folder inside the new folder
        sub_folder_uri = os.path.join(new_folder_uri, "sub_folder")
        logger.debug(sub_folder_uri)
        sub_folder = StorageFolder(sub_folder_uri).create()
        self.assertTrue(sub_folder.exists())

        # Copy an empty file
        empty_src = StorageFile(os.path.join(fixture_folder, "empty_file"))
        dst_file_path = os.path.join(new_folder_uri, "copied_file")
        logger.debug(empty_src.exists())
        time.sleep(2)
        empty_src.copy(dst_file_path)
        self.assertTrue(StorageFile(dst_file_path).exists())

        # Copy a file with content and replace the empty file
        text_src = StorageFile(os.path.join(fixture_folder, "abc.txt"))
        dst_file_path = os.path.join(new_folder_uri, "copied_file")
        text_src.copy(dst_file_path)
        dst_file = StorageFile(dst_file_path)
        self.assertTrue(dst_file.exists())
        # Use the shortcut to read file, the content will be binary.
        self.assertEqual(dst_file.read(), b"abc\ncba\n")

        # Empty the folder. This should delete file and sub folder only
        folder.empty()
        self.assertTrue(folder.exists())
        self.assertTrue(folder.is_empty())
        self.assertFalse(sub_folder.exists())
        self.assertFalse(dst_file.exists())
def __get_valid_daily_cache(self, symbol):
    """Gets the latest un-expired cache file for daily data.

    Cache files are looked up day by day, starting today and going back
    ``self.daily_cache_expiration - 1`` days; the first one that exists wins.

    Args:
        symbol (str): The symbol of the equity/stock.

    Returns:
        The storage file object of the most recent existing cache file,
        or None if no cache file exists within the expiration window.
    """
    for age_in_days in range(self.daily_cache_expiration):
        cache_day = datetime.datetime.now() - datetime.timedelta(days=age_in_days)
        cache_path = self.__cache_file_path(
            symbol,
            self.daily_series_type,
            cache_day.strftime(self.date_fmt),
        )
        candidate = StorageFile(cache_path)
        if candidate.exists():
            return candidate
    return None