Example #1
def computeCorrelation(pairs):
    X = []
    Y = []
    for pair in pairs:
        X.append(pair[1][0])
        Y.append(pair[1][1])

    return correlation(X, Y)
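A minimal usage sketch, assuming correlation is bound to scipy.stats.pearsonr in the original module and that each element of pairs looks like (key, (x, y)), which is what pair[1][0] / pair[1][1] imply; the data below is made up:

from scipy.stats import pearsonr as correlation   # assumption about the original binding

# Hypothetical input: (label, (x, y)) tuples
pairs = [("a", (1.0, 2.1)), ("b", (2.0, 3.9)), ("c", (3.0, 6.2))]

r, p_value = computeCorrelation(pairs)   # pearsonr returns (coefficient, p-value)
print(r, p_value)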
Example #2
    def evaluateSimilarity(self, corpusFileName, outputFileName):
        print("Evaluating Word Similarity")
        machine_scores = []
        human_scores = []
        not_found = 0
        words_not_found = []

        output = open(outputFileName, 'w')
        output.write("# Word 1\tWord 2\tHuman (mean)\tMachine\n")

        with open(corpusFileName) as corpus_lines:
            for corpus_line in corpus_lines:
                if corpus_line[0] == "#":
                    continue

                # Reading word from the corpus
                line = {}
                (line['tag'], line['word_1'], line['word_2'],
                 line['human_score']) = corpus_line.rstrip().split('\t')

                # Retrieving the vectors of the words
                if line['word_1'] not in self.glove:
                    not_found += 1
                    words_not_found.append(line['word_1'])
                    continue

                if line['word_2'] not in self.glove:
                    not_found += 1
                    words_not_found.append(line['word_2'])
                    continue

                word1_vec = np.array(self.glove[line['word_1']])
                word2_vec = np.array(self.glove[line['word_2']])

                # Computing the score based on the two vectors
                machine_score = cos_sim(word1_vec.reshape(1, -1),
                                        word2_vec.reshape(1, -1))[0][0] * 10

                machine_scores.append(machine_score)

                # Human score
                human_scores.append(float(line['human_score']))

                # Write the word pair, the human score, and the embedding-based score.
                o = '\t'.join([
                    line['tag'], line['word_1'], line['word_2'],
                    line['human_score'],
                    str(round(machine_score, 4))
                ])
                output.write(o + '\n')

        # Evaluate score - compute correlation of the two scores
        evaluation = correlation(human_scores, machine_scores)
        evaluation = round(evaluation[0], 4)
        output.write("# Correlation = " + str(evaluation) + "\n")
        output.close()
        print("Evaluation complete.")
        return evaluation
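The method above depends on names that its class/module defines elsewhere. A hedged sketch of the bindings it appears to assume (the exact correlation function is not confirmed by the snippet; word-similarity benchmarks typically use Spearman):

import numpy as np
from scipy.stats import spearmanr as correlation                 # assumption: could also be pearsonr
from sklearn.metrics.pairwise import cosine_similarity as cos_sim

# self.glove is assumed to be a dict mapping each word to its embedding vector.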
Example #3
    def reconstruction_wrapper(alpha_1):

        # Call reconstruction routine --------
        (first_g, g_list, x_array_plot, y_array_plot) = reconstruction(time,
                                                                       alpha_1=alpha_1,
                                                                       alpha_2=alpha_1,
                                                                       alpha_3=1.,
                                                                       alpha_4=0)

        # Compare with the phantom model: negate the correlation so that
        # minimizing this wrapper's return value maximizes agreement.
        return -correlation(g_list[-1].flatten(), phantom_model)[0]
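Wrappers like this are usually handed to a scalar optimizer, since returning the negative correlation turns "maximize agreement with the phantom" into a minimization problem. A hedged sketch, assuming time, reconstruction, and phantom_model exist in the enclosing scope; the bounds are illustrative only:

from scipy.optimize import minimize_scalar

# Minimizing the negative correlation maximizes agreement with the phantom model.
result = minimize_scalar(reconstruction_wrapper, bounds=(0.0, 10.0), method='bounded')
best_alpha_1 = result.x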
Example #4
def compute_correlation(data, distance, prototype_policies, num_prototypes, iterations, verbose=False, size_limit=1000):
    print("Computing distance matrix and similarity matrix (original space):", end=' ')
    data_original = data
    if data.shape[0] > size_limit:
        print()
        print("Dataset too big: subsampling to %s entries only!" % size_limit)
        data = data[np.random.permutation(data.shape[0])[:size_limit], :]
    od = distance(data, data)
    print(od.shape)
    original_distances = squareform(od)

    rho = np.zeros((len(prototype_policies), len(num_prototypes), iterations))

    for m, prototype_policy in enumerate(prototype_policies):
        print(prototype_policy)
        for j, num_proto in enumerate(num_prototypes):
            print("number of prototypes:", num_proto, " - ", end=' ')
            for k in range(iterations):
                print(k, end=' ')
                stdout.flush()
                if verbose: print("Generating %s prototypes as" % num_proto, end=' ')
                # Note that we use the original dataset here, not the subsampled one!
                if prototype_policy == 'random':
                    if verbose: print("random subset of the initial data.")
                    prototype_idx = np.random.permutation(data_original.shape[0])[:num_proto]
                    prototype = [data_original[i] for i in prototype_idx]
                elif prototype_policy == 'fft':
                    prototype_idx = furthest_first_traversal(data_original, num_proto, distance)
                    prototype = [data_original[i] for i in prototype_idx]
                elif prototype_policy == 'sff':
                    prototype_idx = subset_furthest_first(data_original, num_proto, distance)
                    prototype = [data_original[i] for i in prototype_idx]
                else:
                    raise ValueError("Unknown prototype_policy: %s" % prototype_policy)

                if verbose: print("Computing dissimilarity matrix.")
                data_dissimilarity = distance(data, prototype)

                if verbose: print("Computing distance matrix (dissimilarity space).")
                dissimilarity_distances = pdist(data_dissimilarity, metric='euclidean')

                rho[m, j, k] = correlation(original_distances, dissimilarity_distances)[0]
            print()
    return rho
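The function above relies on several module-level names. A hedged sketch of plausible bindings and a call on the 'random' policy only (furthest_first_traversal and subset_furthest_first are project-specific helpers not shown here; correlation is assumed to be scipy.stats.pearsonr):

import numpy as np
from sys import stdout
from scipy.stats import pearsonr as correlation
from scipy.spatial.distance import cdist, pdist, squareform

def distance(A, B):
    # A plausible distance callable to pass in: full rectangular Euclidean distance matrix.
    return cdist(A, B, metric='euclidean')

data = np.random.rand(200, 4)
rho = compute_correlation(data, distance, ['random'], [5, 10], iterations=3)
print(rho.shape)   # (1, 2, 3)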
Example #5
def scatter_correlation(df:pd.DataFrame=None, x:str=None, y:str=None,
                        aliasx:str=None,  aliasy:str=None,
                        text:object=None, color:object='#7ED388') -> (float, go.Figure) :
    '''
        Pearson correlation calculation and scatter plot using x and y.
        If a dataframe is given, non-numeric and null values are removed first.
        df       -> Pandas DataFrame
        x and y  -> If a dataframe is given, x and y are column names of that dataframe;
                    if no dataframe is given, x and y are arrays
        aliasx and aliasy -> label text for the plot

        return
            correlation value and a Plotly Figure with the scatter plot
    '''
    if df is not None:
        if not aliasx:
            aliasx = x
        if not aliasy:
            aliasy = y
#         new_df = not_nulls(df, x, y)
#         new_df = new_df[new_df[x].apply(str_to_float)]
#         new_df = new_df[new_df[y].apply(str_to_float)]
        new_df = remove_not_numeric(df, x,y)
        x, y = new_df[x].astype('float64'), new_df[y].astype('float64')
    if df is None:
        if not aliasx:
            aliasx = 'x'
        if not aliasy:
            aliasy = 'y'
    if text is None:
        text=''
    figure = go.Figure(data=[go.Scatter(x=x, y=y, mode='markers', 
                                        text=text, marker_color=color)]
                      )
    figure.update_layout(
        title="Correlation of " + aliasx + ' and ' + aliasy,
        xaxis_title=aliasx,
        yaxis_title=aliasy,
    )
    corr, _ = correlation(x,y)
    return corr, figure
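A brief usage sketch for the array path (df=None), which avoids the remove_not_numeric helper that the DataFrame path requires; correlation is assumed to be scipy.stats.pearsonr, and the data is made up. The defining module itself still needs pandas and plotly importable:

import numpy as np
from scipy.stats import pearsonr as correlation

x = np.array([1.0, 2.0, 3.0, 4.0])
y = np.array([2.1, 3.9, 6.2, 8.1])

corr, fig = scatter_correlation(x=x, y=y, aliasx="x values", aliasy="y values")
print(corr)
# fig.show()  # opens the scatter plot in a notebook or browser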
Example #7
def compute_correlation(data,
                        distance,
                        prototype_policies,
                        num_prototypes,
                        iterations,
                        verbose=False,
                        size_limit=1000):
    global tracks_t, ids_l, data_original, tracks_s, id_t, tracks_n, a_ind, tracks_subsample
    print("Computing distance matrix and similarity matrix (original space):", end=' ')
    data_original = data
    if data.shape[0] > size_limit:
        print()
        print("Dataset too big: subsampling to %s entries only!" % size_limit)
        data = data[np.random.permutation(data.shape[0])[:size_limit], :]
    od = distance(data, data)
    print(od.shape)
    original_distances = squareform(od)
    # original_distances2 = squareform(od)

    # --- "my code" (original author's note): filter streamlines by length ---
    affine = utils.affine_for_trackvis(voxel_size=np.array([2, 2, 2]))

    lengths = np.array(list(length(data_original)))

    temp = np.where(lengths > 10)[0]
    l = np.argsort(lengths)[::-1][:len(temp)]
    data_original_temp = data_original[l]
    a = streamline_mapping_new_step(data_original_temp, affine=affine)
    tracks_subsample = data_original_temp[a]

    print(len(tracks_subsample))
    # --- end "my code" ---

    rho = np.zeros((len(prototype_policies), len(num_prototypes), iterations))

    for m, prototype_policy in enumerate(prototype_policies):
        print(prototype_policy)
        for j, num_proto in enumerate(num_prototypes):
            print("number of prototypes:", num_proto, " - ", end=' ')
            for k in range(iterations):
                print(k, end=' ')
                stdout.flush()
                if verbose: print("Generating %s prototypes as" % num_proto, end=' ')
                # Note that we use the original dataset here, not the subsampled one!
                if prototype_policy == 'random':
                    if verbose: print("random subset of the initial data.")
                    prototype_idx = np.random.permutation(
                        data_original.shape[0])[:num_proto]
                    prototype = [data_original[i] for i in prototype_idx]

                elif prototype_policy == 'sff':
                    prototype_idx = subset_furthest_first(
                        data_original, num_proto, distance)
                    prototype = [data_original[i] for i in prototype_idx]

                elif prototype_policy == 'fft':
                    prototype_idx = furthest_first_traversal(
                        tracks_subsample, num_proto, distance)
                    prototype = [tracks_subsample[i] for i in prototype_idx]

                else:
                    raise ValueError("Unknown prototype_policy: %s" % prototype_policy)

                if verbose: print("Computing dissimilarity matrix.")
                data_dissimilarity = distance(data, prototype)

                if verbose:
                    print("Computing distance matrix (dissimilarity space).")
                dissimilarity_distances = pdist(data_dissimilarity,
                                                metric='euclidean')

                rho[m, j, k] = correlation(original_distances,
                                           dissimilarity_distances)[0]
            print()
    return rho
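This variant additionally filters tractography streamlines before prototype selection. A hedged guess at the extra imports it assumes (the helpers look like they come from an older dipy release plus project-specific code):

import numpy as np
from sys import stdout
from scipy.spatial.distance import pdist, squareform
from scipy.stats import pearsonr as correlation        # assumption
from dipy.tracking import utils                        # assumption: provides affine_for_trackvis
from dipy.tracking.streamline import length            # assumption

# streamline_mapping_new_step, furthest_first_traversal and subset_furthest_first
# are project-specific helpers defined elsewhere in the original repository.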
Example #8
 def matrix_entry(i, j):
     return correlation(get_column(data, i), get_column(data, j))
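The last entry is a fragment: it references data and get_column from an enclosing scope, and its indentation suggests it was nested inside a matrix-building function. A minimal sketch of plausible context (every name except matrix_entry is an assumption):

from scipy.stats import pearsonr

def correlation(xs, ys):
    # Assumption: return only the coefficient, not the p-value.
    return pearsonr(xs, ys)[0]

def get_column(data, j):
    return [row[j] for row in data]

def make_correlation_matrix(data):
    num_columns = len(data[0])

    def matrix_entry(i, j):
        return correlation(get_column(data, i), get_column(data, j))

    return [[matrix_entry(i, j) for j in range(num_columns)]
            for i in range(num_columns)]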