def sparse_data_example():
    """
  Cluster randomly generated sparse data points.

  Every point stores sizes[i] entries (drawn with npr.randint(5, 9)) at
  distinct, sorted positions sampled from xrange(10). The points are
  handed to pyzentas in flattened form: per-point sizes, concatenated
  indices and concatenated values.
  """
    ndata = 20000
    sizes = np.array(npr.randint(5, 9, size=(ndata, )), dtype=np.uint64)

    # concatenation of the (sorted) entry positions of all points
    flat_indices = []
    for n_entries in sizes:
        flat_indices.extend(sorted(random.sample(xrange(10), n_entries)))
    flat_indices = np.array(flat_indices, dtype=np.uint64)

    # one standard normal value per stored entry
    values = npr.randn(sizes.sum())

    z = pyzentas.pyzen(
        K=100,
        max_rounds=10000,
        max_itok=15.001,
        seed=npr.randint(1011),
        with_tests=False)

    # refinement settings, passed positionally to z.spa
    refine = True
    refine_with = "yinyang"
    refine_rounds = 3000
    refine_time = 10000.

    X = z.spa(sizes, flat_indices, values, refine, refine_with,
              refine_rounds, refine_time)
def mnist_example():
    """
  Cluster MNIST digits with do_vdimap switched on,
  showing how do_vdimap can help.
  """
    import mnist
    reload(mnist)

    n_points = int(1e3)
    X = mnist.read_MNIST(dataset="original", ndata=n_points) / 1000.
    # size of the last axis of one data point (not used further below)
    dimension = X[0].shape[-1]
    npr.seed(1000)

    settings = dict(
        K=100,
        metric='l2',
        energy='quadratic',
        exponent_coeff=0,
        max_itok=10000,
        max_rounds=20,
        seed=1011,
        nthreads=1,
        init="kmeans++-5",
        with_tests=False,
        patient=False)
    z = pyzentas.pyzen(**settings)

    # second positional argument is do_vdimap
    tangerine = z.den(X, True, do_refinement=True)
def tests():
    """
  Run two small clustering problems with with_tests=True, to confirm
  that the algorithms are working correctly.
  """
    K = 7
    seed = 107
    npr.seed(seed)
    # 25 points in 2 dimensions, shared by both tests
    data = 10 * npr.rand(25, 2)

    # Test 1 : clarans at level 3, uniform initialisation
    clarans_settings = dict(
        K=K,
        metric='l2',
        algorithm="clarans",
        level=3,
        energy='quadratic',
        exponent_coeff=0,
        max_time=2,
        max_rounds=30,
        seed=1011,
        nthreads=1,
        with_tests=True,
        patient=False,
        init="uniform",
        do_balance_labels=False,
        rooted=False)
    z = pyzentas.pyzen(**clarans_settings)
    tangerine = z.den(data, False, False)

    # Test 2 : voronoi at level 0, initialised from the first K indices
    voronoi_settings = dict(
        init=np.arange(K),
        K=K,
        algorithm="voronoi",
        level=0,
        metric='l2',
        energy='identity',
        exponent_coeff=0,
        max_time=1,
        max_rounds=40,
        seed=1011,
        nthreads=1,
        with_tests=True,
        patient=False)
    z = pyzentas.pyzen(**voronoi_settings)
    tangerine = z.den(data, True)
def dense_data_example():
    """
  Cluster dense data points: gaussian data is clustered with clarans
  and the result is then refined with yinyang.
  """
    import random

    seed = 1011
    npr.seed(1011)
    random.seed(1011)

    K = int(1e2)
    ndata = int(7e3)
    dimension = 5
    data = np.array(1 + npr.randn(ndata, dimension), dtype=np.float64)

    settings = dict(
        init="kmeans++~1",
        K=K,
        metric='l2',
        energy='quadratic',
        exponent_coeff=0,
        max_rounds=20,
        max_time=1.,
        max_itok=3.0,
        seed=seed,
        nthreads=1,
        patient=False,
        with_tests=False,
        algorithm="clarans",
        level=3)
    z = pyzentas.pyzen(**settings)

    # positional arguments after data : do_vdimap, do_refinement,
    # refinement algorithm, max refinement rounds, max refinement time
    tangerine = z.den(data, False, True, "yinyang", 3000, 10000.)

    # optional comparison with eakmeans, switched off by default
    run_eakmeans = False
    if run_eakmeans:
        sys.path.append(datapaths.datapaths["eaklibdir"])
        import kmeans
        # K distinct data indices, sorted, used as initial centers
        indices = np.array(random.sample(xrange(ndata), K), dtype=np.uint64)
        indices.sort()
        X = kmeans.get_clustering(
            X=data,
            n_clusters=K,
            init=indices,
            verbose=1,
            n_threads=1,
            algorithm="yin-sn")
def from_file_example():
    """
  Read lists of words from text files and cluster them
  using Normalised Levenshtein.
  """
    root = "../data/"
    filenames_list = [root + name for name in ("words1.txt", "words2.txt")]
    outfilename = root + "output1.txt"
    # file with the costs of indels and switches
    costfilename = root + "costs.txt"

    settings = dict(
        K=5,
        metric='normalised levenshtein',
        max_proposals=1000,
        energy='cubic',
        seed=npr.randint(1000))
    z = pyzentas.pyzen(**settings)

    # note the "N" appended to the output file name
    tangerine = z.txt_seq(filenames_list, outfilename + "N", costfilename)
def generated_sequence_example():
    """
  Generate random sequences of chars/ints and cluster using levenshtein.

  ndata sequences are generated, with lengths drawn using
  npr.randint(10, 30). All sequences are stored back to back in one flat
  array (`data`), with `sizes` giving the length of each sequence. Every
  element is drawn uniformly from a 4 symbol alphabet ('A', 'C', 'G', 'T'
  if usechars, otherwise 0, 1, 2, 3).
  """
    ndata = 2000

    # the length of each sequence
    sizes = np.array(npr.randint(10, 30, size=ndata), dtype=np.uint64)

    # the values of the sequences
    usechars = False
    alphabet = ['A', 'C', 'G', 'T'] if usechars else [0, 1, 2, 3]

    # A single draw per element, so that the four symbols are equally
    # likely. Drawing a fresh random number in every branch of an
    # if / elif chain (thresholds 0.25, 0.5, 0.75) instead gives
    # frequencies of about 25% / 37.5% / 28.1% / 9.4%.
    symbol_ids = npr.randint(4, size=int(sizes.sum()))
    data = np.array([alphabet[s] for s in symbol_ids],
                    dtype='c' if usechars else np.int32)

    # The cost of mutating chars/ints (a 4x4 matrix)
    cost_switch = np.array(
        [[0., 10, 8, 11], [10, 0., 10, 9], [8, 10, 0, 10], [11, 9, 10, 0.]],
        dtype=np.float64)

    # The cost of inserting or deleting a char/int
    cost_indel = np.array([10, 11, 10, 9], dtype=np.float64)

    z = pyzentas.pyzen(K=400,
                       metric='levenshtein',
                       max_proposals=100000,
                       max_rounds=2,
                       energy='quadratic',
                       seed=npr.randint(1000),
                       nthreads=1,
                       init="kmeans++-2")
    tangerine = z.seq(sizes=sizes,
                      values=data,
                      cost_indel=cost_indel,
                      cost_switch=cost_switch)
def get_results():
    """
  Return the processed clarans output for every level in `levels`,
  as a dict keyed by level.

  If always_run_from_scratch is false and the cache file
  datapaths.datapaths["comparing_levels_fn"] exists, the results are read
  from that file. Otherwise pyzentas is run once per level on the
  module-level `data` (with module-level K and seed), and the captured
  output is processed with pyzentas.get_processed_output.

  This function only reads the cache file, it does not write it.
  """
    if not always_run_from_scratch:
        cache_fn = datapaths.datapaths["comparing_levels_fn"]
        if os.path.exists(cache_fn):
            # The [()] indexing suggests that the file stores a dict wrapped
            # in a 0-d object array (TODO confirm against the code which
            # writes it). numpy >= 1.16.3 refuses to load object arrays
            # unless allow_pickle=True is given. Unpickling can execute
            # arbitrary code : only load cache files which you trust.
            cached = np.load(cache_fn, allow_pickle=True)[()]
            results = {}
            for level in levels:
                results[level] = cached[level]
            return results

    results = {}
    for level in levels:
        z = pyzentas.pyzen(init="uniform",
                           K=K,
                           metric='l2',
                           energy='quadratic',
                           exponent_coeff=0,
                           max_rounds=20000,
                           max_time=60,
                           seed=seed,
                           nthreads=4,
                           patient=True,
                           with_tests=False,
                           algorithm="clarans",
                           level=level,
                           capture_output=True)

        tangerine = z.den(data, do_vdimap=False, do_refinement=False)
        results[level] = pyzentas.get_processed_output(tangerine['output'])

    return results
def capture_example():
    """
  extracting statistics from runs, with plots
  """

    import matplotlib.pyplot as pl
    K = int(1e3)
    ndata = int(1e4)
    dimension = 3
    #so that distance to nearest ~ 1
    centers = K**(1. / dimension) * npr.rand(K, dimension)
    indices = npr.randint(K, size=(ndata, ))
    data = centers[indices] + 0.5 * npr.randn(ndata, dimension)

    pl.clf()
    pl.ion()

    for max_itok in [x / 2. for x in range(10)]:
        for init in [
                "kmeans++-5"
        ]:  #["kmeans++-10", "uniform"]:# "kmeans++", "afk-mc2-10", "afk-mc2-100", ]:

            print "with ", init, max_itok, "."
            z = pyzentas.pyzen(K=K,
                               capture_output=True,
                               init=init,
                               max_itok=max_itok,
                               seed=1011,
                               level=3)
            tangerine = z.den(data, do_vdimap=False, do_refinement=True)
            oo = pyzentas.get_processed_output(tangerine['output'])
            pl.plot(oo['tT'],
                    oo['mE'],
                    label=init + "[" + str(max_itok) + "]",
                    linestyle='none',
                    marker='.')

    #pl.xscale('log')
    pl.xlabel("time [ms]")
    pl.ylabel("mean energy")
def afk_mc2_subopt_mode():
    """
  We present an example where afk-mc^2 requires extremely
  long chain lengths to match k-means++. There are K centers,
  K-1 are uniformly distributed in [0,1] x [0,1], and the
  K'th center is at (10,10). Data points are drawn as follows:
  (1) select one of the K centers at random, 
  (2) add N(0,sigma) to it, where sigma is small (see below).
  
  Essentially, afk-mc^2 reduces to k-mc^2 
  (see Bachem 2016 for algorithms).
  """

    #################
    # Generate data #
    #################
    K = 200
    ndata = K * 50
    dimension = 2
    centers = npr.rand(K, 2)
    centers[-1] = [10, 10]

    sigma = 1e-5
    labels = npr.randint(K, size=(ndata, ))
    data = centers[labels] + sigma * npr.randn(ndata, 2)
    data /= sigma

    ################
    # Run kmeans++ #
    ################
    E_kmeanspp = []
    print "kmeans++ energies: ",
    for i in range(5):
        z = pyzentas.pyzen(K=K,
                           metric='l2',
                           energy='quadratic',
                           seed=npr.randint(1000),
                           max_rounds=0,
                           init="kmeans++-1",
                           capture_output=True)
        tangerine = z.den(data)
        E_kmeanspp.append(
            tangerine['output'].split("R=0")[1].split("mE=")[1].split()[0])
        print E_kmeanspp[-1], " ",

    print "\nwith afk-mc2:"

    ###############
    # Run afk-mc2 #
    ###############
    E_afkmc2 = []
    l_afkmc2 = []
    for i in range(30):
        chain_length = i**2
        z = pyzentas.pyzen(K=K,
                           metric='l2',
                           energy='quadratic',
                           seed=npr.randint(1000),
                           max_rounds=0,
                           init="afk-mc2-%d" % (chain_length, ),
                           capture_output=True)
        tangerine = z.den(data)
        E_afkmc2.append(
            tangerine['output'].split("R=0")[1].split("mE=")[1].split()[0])
        l_afkmc2.append(chain_length)
        print "chain length: ", l_afkmc2[-1], "\tenergy: ", E_afkmc2[-1]

    return {
        "E_kmeanspp": E_kmeanspp,
        "E_afkmc2": E_afkmc2,
        "l_afkmc2": l_afkmc2
    }
def go(X, K, withskl, witheak, withzen):
    """
  Cluster X into K clusters with each of the selected libraries,
  recording the time taken and the final mse.

  X : data
  K : number of clusters
  withskl, witheak, withzen : bools indicating whether to run with
    scikit-learn, eakmeans and zentas respectively.

  Returns a dict with an entry for every library which was run ("skl",
  "eak", "zen"). Each entry is a dict with keys "t" (elapsed wall-clock
  time of the clustering call, in seconds) and "mse". The "zen" entry
  also has "out", the processed captured output.
  """
    results = {}

    if withskl:
        from sklearn.cluster import KMeans
        # run until convergence, initialise with scikit-learn's special version of k-means++ (see zentas wiki entry for discussion).
        sklc = KMeans(n_clusters=K,
                      init="k-means++",
                      max_iter=100000000,
                      tol=1e-20,
                      verbose=0,
                      n_init=1)
        tsk0 = time.time()
        sklc.fit(X)
        tsk1 = time.time()

        # squared distance from every point to every center,
        # then the mean over points of the distance to the nearest center
        sq_dists = np.sum((np.expand_dims(X, axis=1) -
                           np.expand_dims(sklc.cluster_centers_, axis=0))**2,
                          axis=2)
        sklacc = np.sum(np.min(sq_dists, axis=1)) / X.shape[0]
        results["skl"] = {"t": tsk1 - tsk0, "mse": sklacc}

    if witheak:
        # the dict of paths is datapaths.datapaths
        sys.path.append(datapaths.datapaths["eaklibdir"])
        import kmeans
        teak0 = time.time()
        eak = kmeans.get_clustering(X,
                                    K,
                                    verbose=1,
                                    init="kmeans++",
                                    n_threads=4)
        teak1 = time.time()
        results["eak"] = {"t": teak1 - teak0, "mse": eak["mse"]}

    if withzen:
        results["zen"] = {}
        # run with zentas. pipeline here is (1) kmeans++ (2) clarans (3) lloyd.
        z = pyzentas.pyzen(K=K,
                           metric='l2',
                           energy='quadratic',
                           max_itok=10.0,
                           max_time=5.0,
                           max_proposals=K**2,
                           seed=npr.randint(1000),
                           patient=True,
                           nthreads=4,
                           init="kmeans++-4",
                           with_tests=False,
                           capture_output=True,
                           rooted=False)
        tzen0 = time.time()
        tangerine = z.den(X,
                          do_vdimap=True,
                          do_refinement=True,
                          rf_max_rounds=10000000)
        tzen1 = time.time()
        # elapsed time is end minus start, as for the other libraries
        results["zen"]["t"] = tzen1 - tzen0
        results["zen"]["out"] = pyzentas.get_processed_output(
            tangerine['output'])
        results["zen"]['mse'] = results["zen"]["out"]["mE"][-1]

    return results
    X = npr.randn(N, dim)
    results = {}

    for K in [int(x) for x in 2**np.linspace(np.log2(40), np.log2(2500), 40)]:

        results[K] = {}
        max_itoks = [0, 1, 2, 4]
        for max_itok in max_itoks:
            results[K][max_itok] = {}
            z = pyzentas.pyzen(K=K,
                               metric='l2',
                               energy='quadratic',
                               max_itok=max_itok,
                               max_time=100000,
                               max_proposals=K**2,
                               seed=1011,
                               patient=True,
                               nthreads=4,
                               init="kmeans++-4",
                               with_tests=False,
                               capture_output=True,
                               rooted=False)

            tzen0 = time.time()
            tangerine = z.den(X,
                              do_vdimap=True,
                              do_refinement=True,
                              rf_max_rounds=10000000)
            tzen1 = time.time()
            results[K][max_itok]["t"] = tzen0 - tzen1
            results[K][max_itok]["out"] = pyzentas.get_processed_output(
def go(X, K, alg, seed, kmpp_greedy):
    """
  One run of zentas followed by refinement.

  X : data
  K : number of clusters
  alg : either "kmpp", or a string containing "kmpp-cla" whose last
    "-" separated field is used as max_itok
  seed : the seed passed to pyzentas
  kmpp_greedy : with alg "kmpp", a value greater than 1 selects the
    initialisation "kmeans++~<kmpp_greedy>" instead of "kmeans++-6"

  Returns a dict with keys "t" (elapsed wall-clock time in seconds),
  "out" (the processed captured output) and "mse".
  Raises RuntimeError if alg is not recognised.
  """
    # the two algorithms differ only in max_itok, max_time and init
    if "kmpp-cla" in alg:
        max_itok = float(alg.split("-")[-1])
        max_time = 500.0
        init = "kmeans++-6"
    elif alg == "kmpp":
        max_itok = 0.0
        max_time = 5.0
        init = "kmeans++~%d" % (kmpp_greedy) if kmpp_greedy > 1 else "kmeans++-6"
    else:
        raise RuntimeError("unrecognised alg in go")

    z = pyzentas.pyzen(
        K=K,
        metric='l2',
        energy='quadratic',
        max_itok=max_itok,
        max_time=max_time,
        max_proposals=K**2,
        seed=seed,
        patient=True,
        nthreads=4,
        init=init,
        with_tests=False,
        capture_output=True,
        rooted=False)

    t_start = time.time()

    # more than 15 columns : yinyang with vdimap, otherwise exponion without
    do_vdimap = X.shape[1] > 15
    rf_alg = "yinyang" if do_vdimap else "exponion"

    tangerine = z.den(
        X,
        do_vdimap=do_vdimap,
        do_refinement=True,
        rf_alg=rf_alg,
        rf_max_rounds=100)
    elapsed = time.time() - t_start

    processed = pyzentas.get_processed_output(tangerine['output'])
    return {"t": elapsed, "out": processed, 'mse': processed["mE"][-1]}