import numpy as np
from numpy.lib.recfunctions import append_fields, drop_fields

# findDatasets, rewriteDataset, indexToZero, getDtype, recursiveCollapseGroups,
# uncollapseGroups, losslessCompress and __basegroup_name__ are helpers defined
# elsewhere in this package.

def losslessDecompress(f, group):
    paths = findDatasets(f, group, keyword="Events")
    paths.extend(findDatasets(f, group, keyword="Alignment"))
    paths.extend(findDatasets(f, "all", keyword="Signal", entry_point="Raw"))
    for path in paths:
        # rewrite each dataset in place
        rewriteDataset(f, path)
    return "GZIP=1"
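# Hedged usage sketch, not part of the original module: how a caller might
# apply losslessDecompress to a FAST5 file in place. The filename and the
# "all" group argument are illustrative assumptions.
def _example_lossless_decompress(filename="read.fast5"):
    import h5py
    with h5py.File(filename, "r+") as f:
        return losslessDecompress(f, "all")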
def deepLosslessCompress(f, group):
    paths = findDatasets(f, group, "Events")
    paths = [path for path in paths if "Basecall" in path]
    # index event detection
    if "UniqueGlobalKey/channel_id" in f:
        sampleRate = f["UniqueGlobalKey/channel_id"].attrs["sampling_rate"]
        for path in paths:
            if "event_detection" in f[path].parent.parent.attrs:
                # index back to event detection
                dataset = f[path][()]  # .value was removed in h5py 3
                # convert start times from seconds back to sample indices
                start = np.array([int(round(sampleRate * i)) for i in dataset["start"]])
                dataset = indexToZero(f, path, "start", dataColumn=start)
                # keep the move column, but rewrite it with a narrower dtype:
                # it is stored as int64 for values that never exceed 2
                move = dataset["move"]
                # mean, stdv and length can be recovered from the event
                # detection dataset, so drop them entirely
                dataset = drop_fields(dataset, ["mean", "stdv", "length", "move"])
                dataset = append_fields(dataset, ["move"], [move], [getDtype(move)])
                rewriteDataset(f, path, compression="gzip", compression_opts=9, dataset=dataset)
                # rewrite event detection too - its start column is also far too big
                eventDetectionPath = findDatasets(f, "all", entry_point=f[path].parent.parent.attrs.get("event_detection"))[0]
                if "picopore.start_index" not in f[eventDetectionPath].attrs.keys():
                    eventData = indexToZero(f, eventDetectionPath, "start")
                    rewriteDataset(f, eventDetectionPath, compression="gzip", compression_opts=9, dataset=eventData)
    if __basegroup_name__ not in f:
        f.create_group(__basegroup_name__)
        # collapse the hierarchy into the base group; the loop variable must
        # not shadow the group argument passed to losslessCompress below
        for name, subgroup in f.items():
            if name != __basegroup_name__:
                recursiveCollapseGroups(f, __basegroup_name__, name, subgroup)
    return losslessCompress(f, group)
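# deepLosslessCompress leans on indexToZero, which is defined elsewhere in the
# package. The standalone sketch below is an assumption reconstructed from how
# deepLosslessDecompress reverses it: rebase a sorted integer column to zero,
# narrow its dtype, and return the offset that would be stored as the
# "picopore.start_index" attribute.
def _index_to_zero_sketch(data, column="start"):
    offset = data[column][0]
    rebased = (data[column] - offset).astype(np.uint32)  # small values compress well
    data = drop_fields(data, [column])
    data = append_fields(data, [column], [rebased], [rebased.dtype], usemask=False)
    return data, offset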
def deepLosslessDecompress(f, group):
    # rebuild group hierarchy
    if __basegroup_name__ in f.keys():
        uncollapseGroups(f, f[__basegroup_name__])
    paths = findDatasets(f, group)
    paths = [path for path in paths if "Basecall" in path]
    sampleRate = f["UniqueGlobalKey/channel_id"].attrs["sampling_rate"]
    for path in paths:
        if "event_detection" in f[path].parent.parent.attrs:
            # index back to event detection
            dataset = f[path][()]
            if "mean" not in dataset.dtype.names:
                eventDetectionPath = findDatasets(f, "all", entry_point=f[path].parent.parent.attrs.get("event_detection"))[0]
                eventData = f[eventDetectionPath][()]
                try:
                    # restore absolute start indices on the event detection dataset
                    start = eventData["start"] + f[eventDetectionPath].attrs["picopore.start_index"]
                    del f[eventDetectionPath].attrs["picopore.start_index"]
                    eventData = drop_fields(eventData, ["start"])
                    eventData = append_fields(eventData, ["start"], [start], [getDtype(start)])
                    rewriteDataset(f, eventDetectionPath, compression="gzip", compression_opts=1, dataset=eventData)
                except KeyError:
                    # must have been compressed without start indexing
                    pass
                try:
                    start_index = f[path].attrs["picopore.start_index"]
                    del f[path].attrs["picopore.start_index"]
                except KeyError:
                    # must have been compressed without start indexing
                    start_index = 0
                start = dataset["start"][0] + start_index
                end = dataset["start"][-1] + start_index
                # constrain to the range covered by the basecall
                eventData = eventData[np.logical_and(eventData["start"] >= start, eventData["start"] <= end)]
                # skip event detection rows with no matching basecalled event;
                # both start columns are sorted, so one forward scan suffices
                # (check the bound before indexing to avoid running off the end)
                i = 0
                keepIndex = []
                for time in dataset["start"]:
                    while i < eventData.shape[0] and eventData["start"][i] != time + start_index:
                        i += 1
                    keepIndex.append(i)
                eventData = eventData[keepIndex]
                # convert sample counts back to seconds and restore the dropped columns
                dataset = drop_fields(dataset, "start")
                start = [i / sampleRate for i in eventData["start"]]
                length = [i / sampleRate for i in eventData["length"]]
                dataset = append_fields(dataset, ["mean", "start", "stdv", "length"], [eventData["mean"], start, eventData["stdv"], length])
                rewriteDataset(f, path, dataset=dataset)
    return losslessDecompress(f, group)
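# The forward scan in deepLosslessDecompress that re-matches basecalled events
# to event detection rows can also be vectorised with np.searchsorted, since
# both "start" columns are sorted. This variant is a suggested alternative,
# not picopore code; it assumes every basecalled start has an exact match.
def _match_events_sketch(basecallStart, start_index, detectionStart):
    # returns, for each basecalled start, the index of its first occurrence
    # in the (sorted) detection start column
    return np.searchsorted(detectionStart, np.asarray(basecallStart) + start_index)

# e.g. keepIndex = _match_events_sketch(dataset["start"], start_index, eventData["start"])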