Example no. 1
def losslessDecompress(f, group):
	# collect every dataset that losslessCompress rewrote: basecall
	# events, alignments, and the raw signal
	paths = findDatasets(f, group, keyword="Events")
	paths.extend(findDatasets(f, group, keyword="Alignment"))
	paths.extend(findDatasets(f, "all", keyword="Signal", entry_point="Raw"))
	for path in paths:
		# restore the default compression settings on each dataset
		rewriteDataset(f, path)
	return "GZIP=1"
Example no. 2
import numpy as np
from numpy.lib.recfunctions import append_fields, drop_fields

def deepLosslessCompress(f, group):
	# only basecall event tables can be rebuilt from event detection data
	paths = findDatasets(f, group, "Events")
	paths = [path for path in paths if "Basecall" in path]
	# the sampling rate converts event start times to sample indices
	if "UniqueGlobalKey/channel_id" in f:
		sampleRate = f["UniqueGlobalKey/channel_id"].attrs["sampling_rate"]
		for path in paths:
			if "event_detection" in f[path].parent.parent.attrs:
				# convert start times (seconds) to integer sample indices
				# and rebase them to zero
				dataset = f[path][()]
				start = np.array([int(round(sampleRate * i)) for i in dataset["start"]])
				dataset = indexToZero(f, path, "start", dataColumn=start)
				# keep the move column but retype it: it is stored as int64
				# even though its values never exceed 2
				move = dataset["move"]
				# mean, stdv and length can be recomputed from the event
				# detection dataset on decompression, so drop them here
				dataset = drop_fields(dataset, ["mean", "stdv", "length", "move"])
				dataset = append_fields(dataset, ["move"], [move], [getDtype(move)])
				rewriteDataset(f, path, compression="gzip", compression_opts=9, dataset=dataset)
				# the event detection start column also holds large absolute
				# sample indices, so rewrite that dataset too
				eventDetectionPath = findDatasets(f, "all", entry_point=f[path].parent.parent.attrs.get("event_detection"))[0]
				if "picopore.start_index" not in f[eventDetectionPath].attrs.keys():
					eventData = indexToZero(f, eventDetectionPath, "start")
					rewriteDataset(f, eventDetectionPath, compression="gzip", compression_opts=9, dataset=eventData)
				
	# collapse the group hierarchy into a single base group; it is
	# rebuilt by deepLosslessDecompress
	if __basegroup_name__ not in f:
		f.create_group(__basegroup_name__)
		for name, subgroup in f.items():
			# use a fresh loop variable so the group argument is not shadowed
			if name != __basegroup_name__:
				recursiveCollapseGroups(f, __basegroup_name__, name, subgroup)
	return losslessCompress(f, group)
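deepLosslessCompress also depends on indexToZero. Judging from the decompression path in Example no. 3, which adds a picopore.start_index attribute back onto the start column, indexToZero plausibly zero-bases one column and records the offset in that attribute. A minimal sketch under that assumption (hypothetical, not picopore's real implementation):

import numpy as np
from numpy.lib.recfunctions import append_fields, drop_fields

# Hypothetical sketch only: zero-base one column of a structured dataset,
# recording the offset in the picopore.start_index attribute so that
# decompression can undo the shift.
def indexToZeroSketch(f, path, column, dataColumn=None):
	dataset = f[path][()]
	data = np.asarray(dataset[column] if dataColumn is None else dataColumn)
	offset = data.min()
	f[path].attrs["picopore.start_index"] = offset
	dataset = drop_fields(dataset, [column])
	return append_fields(dataset, [column], [data - offset])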
Example no. 3
import numpy as np
from numpy.lib.recfunctions import append_fields, drop_fields

def deepLosslessDecompress(f, group):
	# rebuild the original group hierarchy
	if __basegroup_name__ in f:
		uncollapseGroups(f, f[__basegroup_name__])
	paths = findDatasets(f, group)
	paths = [path for path in paths if "Basecall" in path]
	sampleRate = f["UniqueGlobalKey/channel_id"].attrs["sampling_rate"]
	for path in paths:
		if "event_detection" in f[path].parent.parent.attrs:
			# rebuild the columns that were dropped during compression
			dataset = f[path][()]
			if "mean" not in dataset.dtype.names:
				eventDetectionPath = findDatasets(f, "all", entry_point=f[path].parent.parent.attrs.get("event_detection"))[0]
				eventData = f[eventDetectionPath][()]
				try:
					# restore absolute sample indices to the start column
					start = eventData["start"] + f[eventDetectionPath].attrs["picopore.start_index"]
					del f[eventDetectionPath].attrs["picopore.start_index"]
					eventData = drop_fields(eventData, ["start"])
					eventData = append_fields(eventData, ["start"], [start], [getDtype(start)])
					rewriteDataset(f, eventDetectionPath, compression="gzip", compression_opts=1, dataset=eventData)
				except KeyError:
					# must have been compressed without start indexing
					pass
				try:
					start_index = f[path].attrs["picopore.start_index"]
					del f[path].attrs["picopore.start_index"]
				except KeyError:
					# must have been compressed without start indexing
					start_index = 0
				start = dataset["start"][0] + start_index
				end = dataset["start"][-1] + start_index
				# constrain to range in basecall
				eventData = eventData[np.logical_and(eventData["start"] >= start, eventData["start"] <= end)]
				# remove events missing from the basecall dataset by matching
				# each basecalled start time to its event detection row
				i = 0
				keepIndex = []
				for time in dataset["start"]:
					while i < eventData.shape[0] and eventData["start"][i] != time + start_index:
						i += 1
					keepIndex.append(i)
				eventData = eventData[keepIndex]
				dataset = drop_fields(dataset, "start")
				# convert sample indices back to times in seconds
				start = [i / sampleRate for i in eventData["start"]]
				length = [i / sampleRate for i in eventData["length"]]
				dataset = append_fields(dataset, ["mean", "start", "stdv", "length"], [eventData["mean"], start, eventData["stdv"], length])
				rewriteDataset(f, path, dataset=dataset)
	return losslessDecompress(f, group)
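A hypothetical end-to-end run of the deep lossless cycle, assuming these functions sit in the current module and that reads.fast5 is a writable nanopore file (the filename is illustrative):

import h5py

# Hypothetical usage sketch: both functions mutate the file in place.
with h5py.File("reads.fast5", "r+") as f:
	deepLosslessCompress(f, "all")    # shrink in place
with h5py.File("reads.fast5", "r+") as f:
	deepLosslessDecompress(f, "all")  # restore the original event tables

Opening the file twice keeps the two passes independent, so the decompressor sees exactly the file state the compressor left behind.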