def cacheDBscanClusters(cfg):
    """Compute the DBSCAN clustering for *cfg* and pickle it to its cache file.

    A no-op if the cache file already exists on disk.
    """
    cachePath = dbscan.getDBscanClustersCacheFileName(cfg)
    # Already cached from a previous run — nothing to do.
    if os.path.isfile(cachePath):
        return

    loadedIsolates = pyroprinting.loadIsolates(cfg)
    clusterResult = dbscan.computeDBscanClusters(loadedIsolates, cfg)

    with open(cachePath, mode='w+b') as outFile:
        pickle.dump(clusterResult, outFile)
def cacheNeighbors(cfg):
    """Compute the full-search neighbors map for *cfg* and pickle it to its cache file.

    A no-op if the cache file already exists. Also prints the average number
    of matches per isolate as a sanity check.
    """
    cacheFileName = fullsearch.getNeighborsMapCacheFileName(cfg)
    # Already cached from a previous run — nothing to do.
    if os.path.isfile(cacheFileName):
        return

    isolates = pyroprinting.loadIsolates(cfg)
    neighbors = fullsearch.computeNeighborsMap(isolates, cfg)

    # Iterate .values() directly — keys were unused (was .items()); guard
    # against an empty map so we don't raise ZeroDivisionError.
    if neighbors:
        avgMatches = sum(len(matches) for matches in neighbors.values()) / len(neighbors)
    else:
        avgMatches = 0
    print("avgMatches: {}".format(avgMatches))

    with open(cacheFileName, mode='w+b') as cacheFile:
        pickle.dump(neighbors, cacheFile)
def loadFromCSV(filename, outfile):
    """Parse a clustering CSV file into sets of isolates and pickle them to *outfile*.

    Rows before the "Cluster Id" header row are skipped; consecutive rows
    sharing a cluster id are grouped into one set. Parsing stops if a second
    clustering section is detected in the same file.
    """
    cfg = config.loadConfig()
    isolates = pyroprinting.loadIsolates(cfg)
    isolateIdMap = {iso.name.strip(): iso for iso in isolates}

    clusters = []
    with open(filename) as csvFile:
        # Drop blank lines before handing the text to the csv reader.
        nonBlank = "".join(line for line in csvFile if line.strip())
        csvLines = nonBlank.splitlines(True)
        csvReader = csv.reader(csvLines, delimiter=',')

        pastHeader = False  # because Aldrin's csv files have some header rows
        currentClusterId = None
        currentCluster = None
        for i, row in enumerate(csvReader):
            if row[0] == "Cluster Id":
                pastHeader = True
                continue
            if not pastHeader:
                continue
            # A second "Threshold:"/separator row means another clustering
            # section follows; only the first one is imported.
            if row[0].startswith("Threshold:") or row[0] == "******":
                print("Multiple clusterings detected in file. Skipping the rest.")
                break
            isoId = row[1].strip()
            if isoId not in isolateIdMap:
                print("extra isolate: {}".format(isoId))
                continue
            # New cluster id — start a fresh set for it.
            if row[0] != currentClusterId:
                currentClusterId = row[0]
                currentCluster = set()
                clusters.append(currentCluster)
            currentCluster.add(isolateIdMap[isoId])

    print(len(clusters))
    with open(outfile, mode='w+b') as cacheFile:
        pickle.dump(clusters, cacheFile)
len([size for size in clusterSizes if size >= 128]) ) noise = [isolate for isolate in isolates if isolate not in clusterMap] return count, minmax, (mean, math.sqrt(var)), len(noise), sizeHistogram def filterSingletonClusters(clusters): filtered = [cluster for cluster in clusters if len(cluster) > 1] print("{}/{}".format(len(filtered), len(clusters))) return filtered if __name__ == '__main__': cfg = config.loadConfig() assert cfg.isolateSubsetSize == "Shared" isolates = pyroprinting.loadIsolates(cfg) # dbscanClusters = dbscan.getDBscanClusters(isolates, cfg) dbscan1Clusters = dbscan.loadDBscanClustersFromFile("dbscanShared_0.995_0.995_1.pickle") dbscan3Clusters = dbscan.loadDBscanClustersFromFile("dbscanShared_0.995_0.995_3.pickle") ohclust99Clusters = filterSingletonClusters(importClusters.getOHClustClusters(99)) # ohclust995Clusters = filterSingletonClusters(importClusters.getOHClustClusters(995)) agglomerativeClusters = filterSingletonClusters(importClusters.getAgglomerativeClusters()) replicatePearsons = loadReplicatePearsons(cfg) db1Pair = ("DBSCAN 1", dbscan1Clusters) db3Pair = ("DBSCAN 3", dbscan3Clusters) oh99Pair = ("OHClust 99", ohclust99Clusters) # oh995Pair = ("OHClust 995", ohclust995Clusters) aggPair = ("AGGLOMERATIVE", agglomerativeClusters)