def _splitIntoChunks(items, parts):
    """Split `items` into exactly `parts` contiguous slices covering every item.

    Slice lengths differ by at most one: the first ``len(items) % parts``
    slices receive one extra element. Slices are empty when there are fewer
    items than parts.
    """
    base, extra = divmod(len(items), parts)
    chunks = []
    start = 0
    for part in range(parts):
        end = start + base + (1 if part < extra else 0)
        chunks.append(items[start:end])
        start = end
    return chunks


def kMeans(k, e, i, o):
    """Cluster the strands read from file `i` into `k` centroids across MPI ranks.

    Rank 0 reads the input, splits it into one chunk per rank, picks the
    initial centroids and, once the loop ends, writes the final centroids to
    `o`. Every rank assigns its own chunk to the current centroids on each
    iteration; rank 0 gathers the assignments and recomputes the centroids.

    Arguments:
        k -- number of centroids (passed to DS.getInitialCentroids and
             updateCentroids).
        e -- threshold: iteration continues while
             DS.diffCentroids(old, new) is greater than `e`.
        i -- path of the input file; each line, stripped of surrounding
             whitespace, is one strand.
        o -- path of the output file; one centroid is written per line.

    NOTE(review): `MPI` is presumably mpi4py's MPI module and the centroids
    are presumably strings (they are concatenated with a newline on output)
    -- confirm against the file's imports and the DS module.
    """
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()

    # Only the root reads the input and prepares the data to distribute.
    if rank == 0:
        with open(i, "r") as f:
            strands = [line.strip() for line in f]
        # One chunk per rank. Every strand must land in some chunk; plain
        # floor-division slicing would drop the last len(strands) % size
        # strands.
        chunks = _splitIntoChunks(strands, size)
        newCen = DS.getInitialCentroids(strands, k)
    else:
        chunks = None
        newCen = None

    # Every rank receives the initial centroids and its own chunk of strands.
    newCen = comm.bcast(newCen, root=0)
    chunk = comm.scatter(chunks, root=0)

    oldCen = [""] * k
    # Repeat until the centroids move by no more than the threshold `e`.
    while DS.diffCentroids(oldCen, newCen) > e:
        oldCen = newCen[:]
        # Each rank assigns its own strands to the current centroids.
        membership = assignMembership(chunk, newCen)
        allMembers = comm.gather(membership, root=0)
        if rank == 0:
            # Flatten the per-rank lists, then recompute the centroids.
            allMembership = []
            for member in allMembers:
                allMembership.extend(member)
            newCen = updateCentroids(allMembership, k)
        # Keep every rank in step with the root's new centroids.
        newCen = comm.bcast(newCen, root=0)

    # Only the root writes the result.
    if rank == 0:
        with open(o, "w+") as fo:
            for c in newCen:
                fo.write(c + '\n')
def assignMembership(strands, centroids):
    """Assign each strand to the centroid at the smallest DS.distance.

    Arguments:
        strands   -- iterable of strands to classify.
        centroids -- sequence of centroids; a cluster is identified by its
                     index in this sequence.

    Returns a list of (clusterIndex, strand) tuples in the order of
    `strands`. When several centroids are equally close, the one with the
    lowest index is chosen.

    Raises ValueError if there is at least one strand but no centroids.
    """
    membership = []
    for strand in strands:
        # Reset the best candidate for every strand so a strand can never
        # inherit the cluster chosen for the previous one.
        minCluster = None
        minDistance = None
        for c, centroid in enumerate(centroids):
            d = DS.distance(strand, centroid)
            # The first centroid is always the baseline; later ones must be
            # strictly closer to replace it.
            if minCluster is None or d < minDistance:
                minDistance = d
                minCluster = c
        if minCluster is None:
            raise ValueError(
                "cannot assign strand %r: no centroids were given" % (strand,))
        membership.append((minCluster, strand))
    return membership