Example #1
import os
import pickle

import dbscan  # project-local module with the DBSCAN helpers used below
import pyroprinting  # project-local module for loading isolate data

def cacheDBscanClusters(cfg):
	# Bail out early if the clusters were already computed and cached.
	cacheFileName = dbscan.getDBscanClustersCacheFileName(cfg)
	if os.path.isfile(cacheFileName):
		return

	isolates = pyroprinting.loadIsolates(cfg)
	clusters = dbscan.computeDBscanClusters(isolates, cfg)

	# Persist the result so subsequent runs can skip the computation.
	with open(cacheFileName, mode='w+b') as cacheFile:
		pickle.dump(clusters, cacheFile)
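
A minimal usage sketch for the function above, assuming the project-local config module that the later examples use; the pickle is read back the same way it is written:

import pickle
import config
import dbscan

cfg = config.loadConfig()
cacheDBscanClusters(cfg)  # no-op if the cache file already exists
with open(dbscan.getDBscanClustersCacheFileName(cfg), mode='rb') as cacheFile:
	clusters = pickle.load(cacheFile)
print(len(clusters))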
Example #2
import os
import pickle

import fullsearch  # project-local module implementing the full neighbor search
import pyroprinting

def cacheNeighbors(cfg):
	# Bail out early if the neighbor map is already cached.
	cacheFileName = fullsearch.getNeighborsMapCacheFileName(cfg)
	if os.path.isfile(cacheFileName):
		return

	isolates = pyroprinting.loadIsolates(cfg)
	neighbors = fullsearch.computeNeighborsMap(isolates, cfg)

	# Report the average number of matches per isolate.
	avgMatches = sum(len(matches) for matches in neighbors.values()) / len(neighbors)
	print("avgMatches: {}".format(avgMatches))

	with open(cacheFileName, mode='w+b') as cacheFile:
		pickle.dump(neighbors, cacheFile)
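
The same cache-or-compute pattern as Example #1; a usage sketch under the same assumption that config is the project-local loader used below:

import config

cfg = config.loadConfig()
cacheNeighbors(cfg)  # computes, prints avgMatches, and pickles only on the first run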
Example #3
import csv
import pickle

import config  # project-local modules, as in the examples above
import pyroprinting

def loadFromCSV(filename, outfile):
	cfg = config.loadConfig()
	isolates = pyroprinting.loadIsolates(cfg)
	isolateIdMap = {iso.name.strip(): iso for iso in isolates}
	clusters = []

	with open(filename) as csvFile:
		csvLines = [line for line in csvFile if line.strip()] # drop blank lines
		# print(csvLines)
		csvReader = csv.reader(csvLines, delimiter=',')
		pastHeader = False # because Aldrin's csv files have some header rows
		currentClusterId = None
		currentCluster = None

		for i, row in enumerate(csvReader):
			# print("{}/{}".format(i+1, len(csvLines)))
			if row[0] == "Cluster Id":
				pastHeader = True
			elif pastHeader:
				if row[0].startswith("Threshold:") or row[0] == "******":
					print("Multiple clusterings detected in file. Skipping the rest.")
					break

				isoId = row[1].strip()
				if isoId in isolateIdMap:
					if row[0] != currentClusterId:
						currentClusterId = row[0]
						currentCluster = set()
						clusters.append(currentCluster)
					currentCluster.add(isolateIdMap[isoId])
				else:
					print("extra isolate: {}".format(isoId))

	# print(clusters)
	print(len(clusters))  # number of clusters parsed from the file

	with open(outfile, mode='w+b') as cacheFile:
		pickle.dump(clusters, cacheFile)
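
The parser above implies a layout roughly like the following (all values hypothetical): rows before the "Cluster Id" header row are ignored, and a row starting with "Threshold:" or "******" marks a second clustering, at which point parsing stops:

some preamble row
Cluster Id,Isolate Id
7,SP-0123
7,SP-0456
8,SP-0789
Threshold: 0.99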
Example #4
		len([size for size in clusterSizes if size >= 128])
	)

	# Isolates not assigned to any cluster are treated as noise.
	noise = [isolate for isolate in isolates if isolate not in clusterMap]

	return count, minmax, (mean, math.sqrt(var)), len(noise), sizeHistogram

import math

import config  # project-local modules, as in the earlier examples
import dbscan
import importClusters
import pyroprinting

def filterSingletonClusters(clusters):
	# Keep only clusters with more than one member and report kept/total.
	filtered = [cluster for cluster in clusters if len(cluster) > 1]
	print("{}/{}".format(len(filtered), len(clusters)))
	return filtered

if __name__ == '__main__':
	cfg = config.loadConfig()
	assert cfg.isolateSubsetSize == "Shared"
	isolates = pyroprinting.loadIsolates(cfg)

	# dbscanClusters = dbscan.getDBscanClusters(isolates, cfg)
	dbscan1Clusters = dbscan.loadDBscanClustersFromFile("dbscanShared_0.995_0.995_1.pickle")
	dbscan3Clusters = dbscan.loadDBscanClustersFromFile("dbscanShared_0.995_0.995_3.pickle")
	ohclust99Clusters = filterSingletonClusters(importClusters.getOHClustClusters(99))
	# ohclust995Clusters = filterSingletonClusters(importClusters.getOHClustClusters(995))
	agglomerativeClusters = filterSingletonClusters(importClusters.getAgglomerativeClusters())
	replicatePearsons = loadReplicatePearsons(cfg)  # helper defined elsewhere in the original module

	db1Pair = ("DBSCAN 1", dbscan1Clusters)
	db3Pair = ("DBSCAN 3", dbscan3Clusters)
	oh99Pair = ("OHClust 99", ohclust99Clusters)
	# oh995Pair = ("OHClust 995", ohclust995Clusters)
	aggPair = ("AGGLOMERATIVE", agglomerativeClusters)
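
The snippet ends after labeling each clustering; a hedged sketch of a follow-up summary loop (an assumption, not part of the original code):

# Hypothetical: print a one-line summary per clustering.
for name, clusterList in (db1Pair, db3Pair, oh99Pair, aggPair):
	sizes = [len(cluster) for cluster in clusterList]
	print("{}: {} clusters, largest has {} isolates".format(name, len(clusterList), max(sizes)))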