Example #1
import numpy as np
from sinkhorn_knopp import sinkhorn_knopp


def normalize(d,
              bin_chr,
              bin_position,
              max_iter=1000,
              epsilon=0.0001,
              windowSize=1000.):
    """Return symmetric and fully balanced matrix using SinkhornKnopp"""
    # make symmetric & normalise
    d += d.T
    d -= np.diag(d.diagonal() / 2)
    # bin sizes (not used further here; kept from window-size normalisation experiments)
    sizes = np.diff(bin_position, axis=1)
    # full balancing: shift to strictly positive values, scale to [0, 1],
    # then balance rows and columns with Sinkhorn-Knopp
    sk = sinkhorn_knopp.SinkhornKnopp(max_iter=max_iter, epsilon=epsilon)
    d += 1
    d /= d.max()
    d = sk.fit(d)
    return d, bin_chr, bin_position
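A minimal, hypothetical usage sketch for the function above (the 3-bin contact counts and bin coordinates are made up; it assumes numpy and the sinkhorn_knopp package as imported above):

# made-up raw contact counts for 3 bins of width 1000 (upper triangle + diagonal)
counts = np.array([[10., 4., 1.],
                   [ 0., 8., 3.],
                   [ 0., 0., 6.]])
bin_chr = np.array(['chr1', 'chr1', 'chr1'])
bin_position = np.array([[0, 1000], [1000, 2000], [2000, 3000]])

balanced, bin_chr, bin_position = normalize(counts, bin_chr, bin_position)
print(balanced.sum(axis=0))  # each column sums to ~1 after balancing
print(balanced.sum(axis=1))  # each row sums to ~1 after balancing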
Example #2
import logging

import numpy as np

logger = logging.getLogger(__name__)


def sinkhorn_temperature_sampling_distribution(langs,
                                               data_params_list,
                                               dataset_sizes,
                                               temp=1.0):
    r"""
    Convert dataset sizes into a distribution which takes into account both the
    availability of a particular lang pair together, as well as the
    availability of a particular lang alone across the pairs. We use the
    Sinkhorn-Knopp algorithm to convert a matrix of lang pair counts into 
    a doubly stochastic matrix, which is then converted into the temperature 
    sampled probabilities. 

    Motivation (section 3.4): https://arxiv.org/abs/2010.11125
    Sinkhorn-Knopp paper: http://msp.org/pjm/1967/21-2/pjm-v21-n2-p14-s.pdf
    Our fork of skp: https://github.com/kaleidoescape/sinkhorn_knopp
    """
    from sinkhorn_knopp import sinkhorn_knopp as skp

    #fill a matrix with language pair counts across datasets
    slangs = sorted(langs)
    A = np.zeros((len(slangs), len(slangs)))  #(src, tgt)
    for i, params in enumerate(data_params_list):
        src_lang, tgt_lang = params['src'], params['tgt']
        src_idx, tgt_idx = slangs.index(src_lang), slangs.index(tgt_lang)
        A[src_idx, tgt_idx] += dataset_sizes[i]
    logger.info(f"Data counts for {langs}: {A}")

    #if any row is fully 0, we have to remove the lang from both axes because
    #we need a square matrix with total support to perform Sinkhorn-Knopp.
    #This will cause us to miss a lang that is only ever used as src or only
    #ever used as tgt, but for multilingual models we typically use both directions
    zero_rows = np.where(~A.any(axis=1))[0]
    if zero_rows.size > 0:
        dropped = [slangs[i] for i in zero_rows]
        for lang in dropped:
            slangs.remove(lang)
        logger.warning(
            f"Ignoring all datasets for langs {dropped}"
            " because these langs are never used as the src")
    A = np.delete(A, zero_rows, 0)
    A = np.delete(A, zero_rows, 1)
    #likewise, if any col is fully 0, remove the lang from both axes
    zero_cols = np.where(~A.any(axis=0))[0]
    if zero_cols.size > 0:
        dropped = [slangs[i] for i in zero_cols]
        for lang in dropped:
            slangs.remove(lang)
        logger.warning(
            f"Ignoring all datasets for langs {dropped}"
            " because these langs are never used as the tgt")
    A = np.delete(A, zero_cols, 0)
    A = np.delete(A, zero_cols, 1)
    if zero_rows.size > 0 or zero_cols.size > 0:
        logger.info(f"Remaining data counts for {langs}: {A}")

    #make the matrix doubly stochastic (rows and cols each sum to 1)
    #and convert that into a new probability distribution with temperature
    sk = skp.SinkhornKnopp()
    probs = sk.fit(A)**(1 / temp)
    probs = probs / sum(probs)
    logger.info(f"Sinkhorn temperature sampled probs for {slangs}: {probs}")
    return probs, slangs
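A minimal usage sketch (the language pairs and dataset sizes are hypothetical; it assumes the function, numpy and logger set up above, and the sinkhorn_knopp package installed):

langs = ['en', 'de', 'fr']
data_params_list = [
    {'src': 'en', 'tgt': 'de'}, {'src': 'de', 'tgt': 'en'},
    {'src': 'en', 'tgt': 'fr'}, {'src': 'fr', 'tgt': 'en'},
    {'src': 'de', 'tgt': 'fr'}, {'src': 'fr', 'tgt': 'de'},
]
dataset_sizes = [4_000_000, 4_000_000, 500_000, 500_000, 50_000, 50_000]

# temp=1.0 keeps the balanced proportions; a larger temp flattens the
# distribution towards the low-resource pairs
probs, used_langs = sinkhorn_temperature_sampling_distribution(
    langs, data_params_list, dataset_sizes, temp=5.0)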
Example #3
import numpy as np
from sinkhorn_knopp import sinkhorn_knopp


def normalize(d):
    """Return fully balanced matrix"""
    sk = sinkhorn_knopp.SinkhornKnopp()  # or e.g. max_iter=100000, epsilon=0.00001
    # make symmetric & normalise
    d += d.T
    d -= np.diag(d.diagonal() / 2)
    # shift to strictly positive entries before balancing
    d += 1
    d = sk.fit(d)
    return d
Example #4
import math
import random

import numpy as np
from sinkhorn_knopp import sinkhorn_knopp as skp


def RandomTopology(num_agents):
    connectivity = []
    for j in range(num_agents):
        # pick a random set of neighbours for agent j (sampled with replacement)
        neighbor_agents = random.choices(list(range(num_agents)),
                                         k=random.randint(
                                             2, math.ceil(num_agents / 2)))
        neighbors = [
            1.0 if i in neighbor_agents or i == j else 0.0
            for i in range(num_agents)
        ]
        connectivity.append(neighbors)
    # balance the 0/1 adjacency matrix into doubly stochastic mixing weights
    sk = skp.SinkhornKnopp()
    pi = sk.fit(np.array(connectivity))
    return connectivity, pi
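A quick, hypothetical check of the result (assumes numpy as imported above; when the sampled topology has total support, the balanced weights are doubly stochastic):

connectivity, pi = RandomTopology(6)
print(np.round(pi.sum(axis=0), 3))  # column sums ~ 1
print(np.round(pi.sum(axis=1), 3))  # row sums ~ 1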
Example #5
import numpy as np


def normalize(d, max_iter=1000, epsilon=0.00001):
    """Return fully balanced matrix"""
    print("SK norm 0-1")
    from sinkhorn_knopp import sinkhorn_knopp
    sk = sinkhorn_knopp.SinkhornKnopp(max_iter=max_iter, epsilon=epsilon)
    # make symmetric & normalise
    d += d.T - np.diag(d.diagonal())
    d += 1
    print(d.max(), d.mean(), d.sum())
    vmax = d.max()
    d = sk.fit(d / vmax)
    print(d.max(), d.mean(), d.sum())
    return d
Example #6
import numpy as np
from sinkhorn_knopp import sinkhorn_knopp


def normalize(d, bin_chr, bin_position, max_iter=1000, epsilon=0.0001, windowSize=1000.):
    """Return symmetric and fully balanced matrix using SinkhornKnopp"""
    print("full sk balancing * dmax")
    # make symmetric & normalise
    d += d.T
    d -= np.diag(d.diagonal() / 2)
    # full balancing: shift, scale to [0, 1], balance, then rescale by the original maximum
    sk = sinkhorn_knopp.SinkhornKnopp(max_iter=max_iter, epsilon=epsilon)
    d += 1
    dmax = d.max()
    d /= dmax
    d = sk.fit(d) * dmax
    return d, bin_chr, bin_position
Example #7
import numpy as np
from sinkhorn_knopp import sinkhorn_knopp

def generate_doubly_stochastic_traffic(self):
    # class method: self._seed, self._size and self._load are set elsewhere
    np.random.seed(self._seed)
    sk = sinkhorn_knopp.SinkhornKnopp()
    # balance a random matrix into doubly stochastic form, then scale by the load
    self._traffic_matrix = sk.fit(np.random.rand(self._size, self._size))
    self._traffic_matrix = self._traffic_matrix * self._load
    return self._traffic_matrix
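A hypothetical host class to make the method above runnable end to end (the class name and constructor are illustrative, not from the original project):

class TrafficGenerator:
    def __init__(self, size, load, seed=0):
        self._size = size
        self._load = load
        self._seed = seed
        self._traffic_matrix = None

    # attach the module-level function above as a method
    generate_doubly_stochastic_traffic = generate_doubly_stochastic_traffic

tm = TrafficGenerator(size=4, load=0.8).generate_doubly_stochastic_traffic()
print(tm.sum(axis=0))  # each column sums to ~0.8 (the load)
print(tm.sum(axis=1))  # each row sums to ~0.8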
Example #8
import numpy as np
import cvxpy as cp
from sinkhorn_knopp import sinkhorn_knopp as skp


def findProbMatrix(hits, type):

    scores = []
    publications = {}
    current_score = 0
    max_score = hits[0]['_score']
    for i, hit in enumerate(hits):
        score = hit['_score'] / max_score
        scores.append(score)
        publication = hit['_source']['publication']
        if publication in publications:
            info = publications.get(publication)
            info.increaseValue(score, i)
        else:
            publications[publication] = PublicationInfo(score, 1, [i])

    norm = np.array(scores)
    u = np.around(norm, decimals=2)
    print(u)

    v = np.array([1.0 / (np.log(2 + i)) for i, _ in enumerate(u)])
    P = cp.Variable((len(u), len(u)))
    objective = cp.Maximize(cp.matmul(cp.matmul(u, P), v))
    constraints = [
        cp.matmul(np.ones((1, len(u))), P) == np.ones((1, len(u))),
        cp.matmul(P, np.ones((len(u), ))) == np.ones((len(u), )), 0 <= P,
        P <= 1
    ]

    # add one constraint per unordered pair of distinct publications,
    # without mutating the list while iterating over it
    list_publications = list(publications.keys())
    for idx, publication in enumerate(list_publications):
        occurrences = publications.get(publication).getOccurrences()
        positions1 = publications.get(publication).getPositions()
        for second_publication in list_publications[idx + 1:]:
            values = []
            positions = []
            for i in positions1:
                if type == 1:
                    values.append(1 / occurrences)
                elif type == 2:
                    values.append(
                        1 / round(publications.get(publication).getScore(), 0))
                else:
                    values.append(
                        round(scores[i], 0) /
                        round(publications.get(publication).getScore(), 0))
            second_occurrences = publications.get(
                second_publication).getOccurrences()
            positions2 = publications.get(second_publication).getPositions()
            for j in positions2:
                if type == 1:
                    values.append(-1 / second_occurrences)
                elif type == 2:
                    values.append(-1 / round(
                        publications.get(second_publication).getScore(), 0))
                else:
                    values.append(-round(scores[j], 0) / round(
                        publications.get(second_publication).getScore(), 0))
            positions = positions1 + positions2
            print(values)
            positions.sort()
            constraints.append(
                cp.matmul(cp.matmul(np.array(values), P[positions]), v) == 0)

    prob = cp.Problem(objective, constraints)
    result = prob.solve(solver=cp.SCS)

    p_matrix = P.value

    print("PROBABILITY MATRIX %s\n" % p_matrix)
    for i in range(p_matrix.shape[0]):
        for j in range(p_matrix.shape[1]):
            if p_matrix[i][j] < 0:
                p_matrix[i][j] = 0

    p_matrix = np.around(p_matrix, decimals=4)

    sk = skp.SinkhornKnopp()

    ## Re-fit the matrix repeatedly on its transpose; this nudges the values
    ## only slightly each time so that both row and column sums approach 1.
    ## Thanks to https://github.com/btaba/sinkhorn_knopp
    ## 1000 iterations are used to train the system
    for i in range(1000):
        p_matrix = sk.fit(p_matrix.T)
    print(np.sum(p_matrix, axis=0))
    print(np.sum(p_matrix, axis=1))
    print('\n %s' % p_matrix)
    return p_matrix
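For intuition, a small standalone sketch of the transpose-and-refit step used above (the 3x3 matrix is made up; assumes numpy and the btaba sinkhorn_knopp package):

import numpy as np
from sinkhorn_knopp import sinkhorn_knopp as skp

M = np.array([[0.6, 0.3, 0.1],
              [0.2, 0.5, 0.3],
              [0.1, 0.1, 0.8]])
sk = skp.SinkhornKnopp()
for _ in range(1000):
    M = sk.fit(M.T)
print(M.sum(axis=0))  # column sums, each close to 1
print(M.sum(axis=1))  # row sums, each close to 1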
Example #9
import torch
from sinkhorn_knopp import sinkhorn_knopp as skp


def double_stochastic_norm_sinkhorn(similarity):
    # see their code (https://github.com/btaba/sinkhorn_knopp)
    sk = skp.SinkhornKnopp()
    DS = sk.fit(similarity)
    return torch.from_numpy(DS).float()
Example #10
File: test.py Project: Coni63/SO
import numpy as np
from scipy.spatial.distance import squareform, pdist
import pandas as pd
from sklearn import datasets
from sinkhorn_knopp import sinkhorn_knopp as skp
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

iris = datasets.load_iris()

X = iris.data
y = iris.target

# pairwise squared Euclidean distances -> Gaussian affinities
D = squareform(pdist(X, 'sqeuclidean'))
P = np.exp(-D)

# balance the affinity matrix so rows and columns each sum to 1
sk = skp.SinkhornKnopp()
P = sk.fit(P)

no_dims = 3
n = X.shape[0]
min_gain = 0.01
momentum = 0.5
final_momentum = 0.8
epsilon = 500
mom_switch_iter = 250
max_iter = 1000

# zero the diagonal and re-symmetrise the balanced affinities
P[np.diag_indices_from(P)] = 0.

P = (P + P.T) / 2