def _analyze_barcode(gzip_fastq, json_stats, logger):
    """Count reads by barcode in a gzipped FASTQ file and save the stats as JSON.

    Args:
        gzip_fastq (str): Path to the gzip-compressed FASTQ file.
        json_stats (str): Destination path/URI for the JSON barcode stats.
        logger: Logger used for progress messages.

    Returns:
        str: json_stats, the location the stats were written to.
    """
    import shutil

    logger.debug(f"Analyzing barcode in {gzip_fastq}")
    logger.debug("Counting reads by barcode...")
    # Strip only a trailing ".gz" suffix; str.replace() would also remove
    # ".gz" occurring in the middle of the path.
    fastq = gzip_fastq[:-3] if gzip_fastq.endswith(".gz") else gzip_fastq
    with gzip.open(gzip_fastq, 'rb') as gzip_file:
        with open(fastq, "wb") as unzipped_file:
            logger.debug("Unzipping %s to %s ..." % (gzip_fastq, fastq))
            # Stream in 1 MiB chunks; copyfileobj replaces the manual read loop.
            shutil.copyfileobj(gzip_file, unzipped_file, 1 << 20)
    barcode_stats = IlluminaFASTQ(fastq).count_by_barcode()
    logger.debug(f"Barcode count: {len(barcode_stats)}")
    with StorageFile.init(json_stats, 'w') as fp:
        json.dump(barcode_stats, fp)
    return json_stats
def test_text_read(self):
    """An empty file under the test root reports zero size, position, and content."""
    path = os.path.join(self.TEST_ROOT, "file_in_test_folder")
    with StorageFile.init(path) as storage_file:
        self.assertEqual(storage_file.size, 0)
        self.assertEqual(storage_file.tell(), 0)
        # Seeking to the end of an empty file lands at offset 0.
        self.assertEqual(storage_file.seek(0, 2), 0)
        self.assertEqual(len(storage_file.read()), 0)
def test_gs_read_seek(self):
    """A gs:// URI opens as a seekable, readable file with the expected size."""
    uri = "gs://aries_test/file_in_root.txt"
    with StorageFile.init(uri) as remote_file:
        self.assertEqual(remote_file.scheme, "gs")
        self.assertTrue(remote_file.seekable())
        self.assertTrue(remote_file.readable())
        self.assertEqual(remote_file.size, 34)
def __save_data_frame(self, df, symbol, series_type):
    """Persist a data frame to the cache file for (symbol, series_type).

    Returns:
        The cache file path, or None when the frame has no rows.
    """
    if not df.empty:
        file_path = self.__cache_file_path(symbol, series_type)
        logger.debug("Saving %s rows to... %s" % (len(df), file_path))
        with StorageFile.init(file_path, 'w') as csv_file:
            df.to_csv(csv_file)
        return file_path
    logger.info("Data frame is empty.")
    return None
def test_text_read_write(self):
    """Write to a new temp file, read the content back, then delete it."""
    temp_file_path = os.path.join(self.TEST_ROOT, "temp_file.txt")
    with StorageFile.init(temp_file_path, 'w+') as temp_file:
        self.assertTrue(temp_file.writable())
        self.assertEqual(temp_file.tell(), 0)
        self.assertEqual(temp_file.write("abc"), 3)
        self.assertEqual(temp_file.tell(), 3)
        # Rewind and confirm the content round-trips.
        temp_file.seek(0)
        self.assertEqual(temp_file.read(), "abc")
        # TODO: File may not exist on the cloud until it is closed.
        temp_file.delete()
def __intraday_get_full_data(self, symbol):
    """Gets the most recent intraday data (which may include data of multiple days.)

    Args:
        symbol (str): The symbol of the equity/stock.

    Returns:
        A pandas data frame of intraday series data.
    """
    series_type = self.intraday_series_type
    # Serve from cache when a still-valid cached file exists.
    cached_file = self.__intraday_valid_cache(symbol)
    if cached_file:
        logger.debug("Reading cached file: %s" % cached_file.uri)
        # NOTE(review): cached_file is invoked like a context-manager factory;
        # presumably the storage-file type implements __call__(mode) — confirm.
        with cached_file('r') as f:
            df = pd.read_csv(f, index_col=0, parse_dates=['timestamp'])
        return df
    # Cache miss: request the full intraday series at 1-minute intervals.
    df = self.__request_data(symbol, series_type, 'full', interval="1min")
    # Cache file name encodes the symbol prefix plus the current timestamp.
    file_path = os.path.join(self.cache, self.__intraday_cache_file_prefix(symbol)) \
        + datetime.datetime.now().strftime(self.intraday_time_fmt)
    logger.debug("Saving intraday data...")
    with StorageFile.init(file_path, 'w') as f:
        df.to_csv(f)
    # Group data by date
    groups = df.groupby(df['timestamp'].dt.normalize())
    # Get the latest date in the data frame
    dates = [str(name).split(" ")[0] for name, _ in groups]
    latest = max(dates)
    # Additionally save one per-date cache file for every *complete* day.
    for name, group in groups:
        date = str(name).split(" ")[0]
        # The data for a date is complete if there is data at 1600 or the date is not the latest one
        if not group[group.timestamp == date + " 16:00:00"].empty or date < latest:
            date_file_path = self.__cache_file_path(
                symbol, series_type, date)
            with StorageFile.init(date_file_path, 'w') as f:
                group.reset_index(drop=True).to_csv(f)
    return df
def __init__(self, uri, annotation_uri):
    """Load the file at *uri*, split it into header and variant lines,
    and attach annotations loaded from *annotation_uri*.
    """
    super().__init__(uri)
    raw = StorageFile.init(uri).read()
    if isinstance(raw, bytes):
        raw = raw.decode()
    self.content = raw.split("\n")
    self.headers = []
    self.variants = []
    self.annotations = self.load_annotation(annotation_uri)
    for line in self.content:
        # Skip blank lines entirely.
        if not line:
            continue
        # "#"-prefixed lines are headers; everything else is a variant record.
        if line.startswith("#"):
            self.headers.append(line)
        else:
            key = self.variant_key(line)
            annotation = self.annotations.get(key)
            self.variants.append(Variant(line, annotation))
def peek_barcode(self):
    """Scan the first 4000 lines of the gzipped FASTQ file and tally barcodes.

    Returns:
        dict: Maps each barcode string to the value computed by __count_barcode.
    """
    barcode_dict = {}
    with StorageFile.init(self.file_path, 'rb') as raw_stream:
        with gzip.GzipFile(fileobj=raw_stream) as gz:
            for line_number, line in enumerate(gz, start=1):
                # Only peek at the beginning of the file.
                if line_number > 4000:
                    break
                # The line containing barcode starts with @
                if not line.startswith(b"@"):
                    continue
                if isinstance(line, bytes):
                    line = line.decode()
                # Raw barcode
                barcode = line.strip().split(":")[-1]
                # Normalize dual-index barcodes before counting.
                if re.match(self.dual_index_pattern, barcode):
                    barcode = self.convert_barcode(barcode)
                barcode_dict[barcode] = self.__count_barcode(
                    barcode_dict, barcode, line_number)
    return barcode_dict
def create_file(cls, relative_path, content):
    """Write *content* to a file at *relative_path* under the class test root."""
    target = os.path.join(cls.TEST_ROOT, relative_path)
    with StorageFile.init(target, "w") as out_file:
        out_file.write(content)
def read_count(self):
    """Count the reads in the file referenced by self.uri.

    Re-opens the gzip stream over a local copy of the file (resetting any
    previous iteration state on self.gzip) and consumes the iterator.

    Returns:
        int: The number of items produced by iterating self.
    """
    logger.debug("Counting reads in file %s..." % self.uri)
    self.gzip = gzip.GzipFile(
        fileobj=StorageFile.init(self.uri, "rb").local())
    # Stream the count instead of materializing every read in a list
    # (len(list(self)) held the whole file's reads in memory at once).
    return sum(1 for _ in self)
def __init__(self, uri):
    """Set up a gzip reader over the storage file at *uri* and reset the counter."""
    self.uri = uri
    self.current = 0
    # Wrap the raw storage stream so reads are transparently decompressed.
    source = StorageFile.init(uri, "rb")
    self.gzip = gzip.GzipFile(fileobj=source)