Example #1
    def __init__(self, hubs_num, sim_threshold, hub_type, \
        coupling, lambda0, a0, miu0, sigma0, sampling_method):

        self._graph = Graph(directed=False)
        self._graph.vertex_properties[
            "label"] = self._graph.new_vertex_property("string")
        self._graph.vertex_properties[
            "acqscore"] = self._graph.new_vertex_property("double")
        self._graph.vertex_properties[
            "freq"] = self._graph.new_vertex_property("int")

        self._graph.edge_properties[
            "distance"] = self._graph.new_edge_property("double")
        self._graph.edge_properties[
            "similarity"] = self._graph.new_edge_property("double")

        # Maps each word to the vertex object
        self._words_vertex_map = {}

        # Information/parameters about categories
        self._categories = {}
        self._words_token_freq = 0.0
        self._coupling = coupling
        self._lambda0 = lambda0
        self._a0 = a0
        self._miu0 = miu0
        self._sigma0 = sigma0
        self._sampling_method = sampling_method

        self._wnlabels = WordnetLabels()

        # Parameters
        # Number of hubs that are compared with each word
        self._hubs_num = hubs_num  #75
        # The similarity threshold for connecting two words
        self._sim_threshold = sim_threshold  #0.6
        self._hub_type = hub_type

        self.max_computations = []
        self.computations = []
        #The number of computations used in updating current edges
        self.update_computations = []
        #the number of new computations done
        self.new_computations = []

        # List to keep top nodes
        self._most_frequent_nodes = []

        self._highest_degree_nodes = []
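
The snippet above (repeated in Example #2) is the constructor of the WordsGraph class shown in full in Examples #3 and #4. Below is a minimal instantiation sketch; the argument values are illustrative guesses only (the inline comments hint at 75 and 0.6 for hubs_num and sim_threshold, but select_hubs later multiplies hubs_num by the vocabulary size, which suggests a proportion), and it assumes the graph-tool, NumPy and project-specific imports (Graph, WordnetLabels, Category, evaluate) are available.

# Hypothetical usage sketch -- argument values are illustrative, not taken from the source.
words_graph = WordsGraph(
    hubs_num=0.05,           # interpreted as a proportion of the vocabulary in select_hubs
    sim_threshold=0.6,       # minimum similarity for connecting two words
    hub_type="hub-freq",     # one of the "hub-*" strategies handled in select_hubs / add_word
    coupling=1.0,            # hyperparameters forwarded to Category(...)
    lambda0=1.0, a0=1.0, miu0=0.0, sigma0=1.0,
    sampling_method="map")   # "map" (local MAP) or "spf" (single-particle particle filter)
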
Example #2
    def __init__(self, hubs_num, sim_threshold, hub_type, \
        coupling, lambda0, a0, miu0, sigma0, sampling_method):

        self._graph = Graph(directed=False)
        self._graph.vertex_properties["label"] = self._graph.new_vertex_property("string")
        self._graph.vertex_properties["acqscore"] = self._graph.new_vertex_property("double")
        self._graph.vertex_properties["freq"] = self._graph.new_vertex_property("int")

        self._graph.edge_properties["distance"] = self._graph.new_edge_property("double")
        self._graph.edge_properties["similarity"] = self._graph.new_edge_property("double")

        # Maps each word to the vertex object
        self._words_vertex_map = {}

        # Information/parameters about categories
        self._categories = {}
        self._words_token_freq = 0.0
        self._coupling = coupling
        self._lambda0  = lambda0
        self._a0 = a0
        self._miu0 = miu0
        self._sigma0 = sigma0
        self._sampling_method = sampling_method
        
        self._wnlabels = WordnetLabels()

        # Parameters
        # Number of hubs that are compared with each word
        self._hubs_num = hubs_num #75
        # The similarity threshold for connecting two words
        self._sim_threshold = sim_threshold #0.6
        self._hub_type = hub_type


        self.max_computations = []
        self.computations = []
        #The number of computations used in updating current edges
        self.update_computations = []
        #the number of new computations done
        self.new_computations = []


        # List to keep top nodes
        self._most_frequent_nodes = []
        
        self._highest_degree_nodes = []
Example #3
class WordsGraph():

    def __init__(self, hubs_num, sim_threshold, hub_type, \
        coupling, lambda0, a0, miu0, sigma0, sampling_method):

        self._graph = Graph(directed=False)
        self._graph.vertex_properties["label"] = self._graph.new_vertex_property("string")
        self._graph.vertex_properties["acqscore"] = self._graph.new_vertex_property("double")
        self._graph.vertex_properties["freq"] = self._graph.new_vertex_property("int")

        self._graph.edge_properties["distance"] = self._graph.new_edge_property("double")
        self._graph.edge_properties["similarity"] = self._graph.new_edge_property("double")

        # Maps each word to the vertex object
        self._words_vertex_map = {}

        # Information/parameters about categories
        self._categories = {}
        self._words_token_freq = 0.0
        self._coupling = coupling
        self._lambda0  = lambda0
        self._a0 = a0
        self._miu0 = miu0
        self._sigma0 = sigma0
        self._sampling_method = sampling_method
        
        self._wnlabels = WordnetLabels()

        # Parameters
        # Number of hubs that are compared with each word
        self._hubs_num = hubs_num #75
        # The similarity threshold for connecting two words
        self._sim_threshold = sim_threshold #0.6
        self._hub_type = hub_type


        self.max_computations = []
        self.computations = []
        #The number of computations used in updating current edges
        self.update_computations = []
        #the number of new computations done
        self.new_computations = []


        # List to keep top nodes
        self._most_frequent_nodes = []
        
        self._highest_degree_nodes = []
    
    
    def _select_features(self, word_features, N=15):
        '''Select the N features of a word that have the highest probabilities.'''
        #should move to meaning
        sorted_features = []
        for feature in word_features:
            sorted_features.append([feature, word_features[feature]])
        # Sort by probability in descending order so the [:N] slice below keeps the most probable features
        sorted_features = sorted(sorted_features, key=itemgetter(1), reverse=True)

        selected_features = {}
        for feature, value in sorted_features[:N]:
            selected_features[feature] = value


        return selected_features


    def _calculate_similarity(self, word_features, other_word_features):
        ''' Calculate the cosine similarity between two words, given their feature dictionaries '''
        #should move to evaluate
        features = set(word_features.keys()) | set(other_word_features.keys())

        meaning1_vec = np.zeros(len(features))
        meaning2_vec = np.zeros(len(features))

        i = 0
        for feature in features:
            if word_features.has_key(feature):
                meaning1_vec[i] = word_features[feature]
            if other_word_features.has_key(feature):
                meaning2_vec[i] = other_word_features[feature]
            i += 1

        cos = np.dot(meaning1_vec, meaning2_vec)

        x = math.sqrt(np.dot(meaning1_vec, meaning1_vec))

        y = math.sqrt(np.dot(meaning2_vec, meaning2_vec))

        return cos / (x * y)
    
    
    def calc_vertex_score(self, rel_freq, rel_degree, recency):
        if self._hub_type == "hub-freq":
            return rel_freq

        if self._hub_type == "hub-degree":
            return rel_degree

        if self._hub_type == "hub-recency":
            return recency

        if self._hub_type == "hub-freq-degree-recency":
            return rel_freq * rel_degree * recency

        if self._hub_type == "hub-freq-degree":
            return 0.5 * (rel_freq + rel_degree)
        
        '''
        sum_freq = 0.0
        for v in self._graph.vertices():
            sum_freq +=  self._graph.vertex_properties["freq"][v]

        sum_degree = float(sum(self._graph.degree_property_map("total").a))

        for v in self._graph.vertices():
            rel_freq = self._graph.vertex_properties["freq"][v] / sum_freq

            rel_degree = 0
            if sum_degree != 0:
                rel_degree = v.out_degree() / sum_degree

            word = self._graph.vertex_properties["label"][v]
            recency = 1.0 / (ctime - last_time[word] +   1)

            score = self.calc_vertex_score(rel_freq, rel_degree, recency)
            vertices.append([v, score])

        #print word, rel_freq,  recency, rel_degree, score

        vertices = sorted(vertices, key=itemgetter(1), reverse=True)

        return vertices[:hubs_num]
        '''

    def update_most_list(self, vertex, vertex_value, maximum_list, list_desired_size, t="deg"):
        '''Keep maximum_list as a list of [vertex, value] pairs, sorted by value and capped at list_desired_size.'''
        list_desired_size = 150 #TODO CHANGE
        
        #TODO sorting the list is not the most efficient way to do this -- change?
        #print vertex, vertex_value
        for i in range(len(maximum_list)):
            v = maximum_list[i][0]
            if self._graph.vertex_properties["label"][v] == self._graph.vertex_properties["label"][vertex]:
                if vertex_value < maximum_list[i][1] and t == "freq":
                    print "ERROR", self._graph.vertex_properties["label"][v], v, maximum_list[i][1], vertex_value

                maximum_list[i][1] = vertex_value
                maximum_list.sort(key=itemgetter(1)) 

                return 
        
        if len(maximum_list) < list_desired_size:
            maximum_list.append([vertex, vertex_value])
            maximum_list.sort(key=itemgetter(1))
        else:
            if vertex_value > maximum_list[0][1]:
                maximum_list[0][0] = vertex
                maximum_list[0][1] = vertex_value
                maximum_list.sort(key=itemgetter(1)) 

       # print maximum_list

    '''    
    if self._hub_type == "hub-degree-freq-context":
        for w in context:
            vertices.append([self._words_vertex_map[w], 1.])

        return self._highest_degree_nodes + self._most_frequent_nodes + vertices

    if self._hub_type == "hub-freq-context":
        for w in context:
            vertices.append([self._words_vertex_map[w], 1.])

        return self._most_frequent_nodes + vertices
    '''

    def select_hubs(self, context):
        vertices = []
        vert_num = len(self._words_vertex_map.keys())
        hubs_num = int(round(self._hubs_num * vert_num))


        if self._hub_type in ["hub-freq", "hub-freq-random"]:
            vertices = self._most_frequent_nodes[-1 * hubs_num:][:]

        if self._hub_type in ["hub-degree", "hub-degree-random"]:
            vertices = self._highest_degree_nodes[-1 * hubs_num:][:]
        
        if self._hub_type in ["hub-context", "hub-context-random", \
                "hub-categories-context", "hub-categories-prob-context"]:
            #hubs_num = self._hubs_num 
           
            selected_context = context
            if hubs_num  < len(context):
                selected_context = context[-1 * hubs_num:]
                
            for w in selected_context:
                vertices.append([self._words_vertex_map[w], 1.])

        if self._hub_type in ["hub-random", "hub-context-random", "hub-freq-random", "hub-degree-random",\
                "hub-categories-random", "hub-categories-prob-random"]:
            #hubs_num = self._hubs_num 

            indices = range(0, vert_num)
            if vert_num > hubs_num:
                indices = self.random_selection(hubs_num, 0, vert_num - 1)
            
            for index in indices:
                vertices.append([self._graph.vertex(index), 1.]) 

        return vertices
 


    def random_selection(self, num, min_range, max_range):
        selected = []
        used = set([])
        
        while len(selected) < num:
            rand_index = random.randint(min_range, max_range)
            if rand_index in used: continue
            used.add(rand_index)
            selected.append(rand_index)
        
        return selected

    def add_edge(self, word, other_word, word_m, other_word_m, beta, simtype):
        ''' Add an edge between the two given words, if their similarity is
        higher than a threshold'''

        if word == other_word:
            return False

        #sim = self.calculate_similarity(word_features, other_word_features)
        sim = evaluate.calculate_similarity(beta, word_m, other_word_m, simtype)


        # if the words are similar enough, connect them.
        # TODO this can be done probabilistically -- that is connect based on similarity
        if sim >= self._sim_threshold:

            vert = self._words_vertex_map[word]
            other_vert =  self._words_vertex_map[other_word]

            new_edge = self._graph.add_edge(vert, other_vert)
            self._graph.edge_properties["distance"][new_edge] = max(0, 1 - sim)
            self._graph.edge_properties["similarity"][new_edge] = sim

            #update the list of nodes with most degree
            
            self.update_most_list(vert, vert.out_degree(), self._highest_degree_nodes, self._hubs_num)
            self.update_most_list(other_vert, other_vert.out_degree(), self._highest_degree_nodes, self._hubs_num)
            return True

        return False

    def evaluate_categories(self, filename):
        ''' Use WordNet labels to calculate precision & recall for the created categories '''

        #Count for each label in all the categories
        labels_count = {}

        for category_id in self._categories.keys():
            category = self._categories[category_id]
            category_labels = {}
            category_words_count = 0

            # Count the frequency of each label type of words in this category and
            # the number of words in this category
            for word in category._words.keys():
                label = self._wnlabels.wordnet_label(word)

                #TODO Words that do not have a wn-label are all treated as sharing a single label
                #if label == CONST.NONE:
                #    continue

                if label not in category_labels:
                    category_labels[label] = 0

                # Add the frequency of the word in the category
                category_labels[label] += category._words[word]
                category_words_count += category._words[word]

            #if len(labels) < 1:
            #    continue

            # This category's label is the most frequent of all the words' labels
            most_frequent_label = ""
            freq = 0

            print "category", category._id
            for label in category_labels:
                print "wn-label", label, category_labels[label], category_words_count

                if category_labels[label] > freq:
                    freq = category_labels[label]
                    most_frequent_label = label

                if not labels_count.has_key(label):
                    labels_count[label] = 0
                labels_count[label] += category_labels[label]

            category._label = most_frequent_label
            category._precision = float(freq) / category_words_count
            category._freq = float(freq)
            print "----"

        statfile = open(filename + "categories.txt", 'a')
        #print
        all_precisions = []
        all_recalls = []
        for category_id in self._categories:
            category = self._categories[category_id]
            category._recall = category._freq / labels_count[category._label]
            print category._id, category._label,"freq", category._freq, np.sum(category._words.values()) ,'---precision', category._precision, "recall", category._recall
            all_precisions.append(category._precision)
            all_recalls.append(category._recall)
            statfile.write("id %s label %s freq %d precision %.2f recall %.2f \n" % \
                    (category._id, category._label, np.sum(category._words.values()), category._precision, category._recall))

        statfile.write("avg_precision %.2f avg_recall %.2f \n" % (np.mean(all_precisions), np.mean(all_recalls)))
        statfile.close()

    def pick_category(self, post_prob_k):

        # Find the category with max post prob
        if self._sampling_method =='map': #local MAP
            max_category_id = 1
            for category_id in post_prob_k:
                if post_prob_k[category_id] > post_prob_k[max_category_id]:
                    max_category_id = category_id
            return max_category_id

        elif self._sampling_method == 'spf': #single-particle particle filter
#            print self._sampling_method
            rand = random.random()
            min_range = 0
            denom = logsumexp(post_prob_k.values())

            for category_id in post_prob_k:
                ppk = math.exp(post_prob_k[category_id] - denom)
                if min_range <= rand < ppk + min_range:
                    return category_id
                min_range += ppk

    def select_words_from_categ(self, post_prob_k):
        denom = logsumexp(post_prob_k.values())
        selected_words = set([])

        vert_num = len(self._words_vertex_map.keys())
        #hubs_num = round(self._hubs_num * vert_num)

        # TODO changed from round to ceil June 5
        hubs_num = np.ceil(self._hubs_num * vert_num)

        for category_id in self._categories:
            ppk = math.exp(post_prob_k[category_id] - denom)
            select_words_num = round(hubs_num * ppk)

            categ_words = self._categories[category_id]._words.keys()[:]
            indices = range(0, len(categ_words))
            if len(categ_words) > select_words_num:
                indices = self.random_selection(select_words_num, 0, len(categ_words) - 1)

            for index in indices:
                selected_words.add(categ_words[index])

        #print len(selected_words)
        return selected_words

    def _add_word_to_category(self, word, word_m):#, lexicon, marked, beta, simtype):
        '''Assign the word to a category (n refers to the token count, not the type count)'''

        post_prob_k = {} #P(K|W), where K is the category

        for category_id in self._categories:
            category = self._categories[category_id]
            post_prob_k[category_id] = category.posterior(word, word_m._meaning_probs, self._words_token_freq)
            #post_prob_k[category_id] = category.posterior(word, word_top_features, self._words_token_freq)


        new_category = Category(len(self._categories) + 1, self._coupling, self._lambda0, self._a0, \
        self._miu0, self._sigma0)
        post_prob_k[new_category._id] = new_category.posterior(word, word_m._meaning_probs, self._words_token_freq)
#        post_prob_k[new_category._id] = new_category.posterior(word, word_top_features, self._words_token_freq)


        selected_category_id = self.pick_category(post_prob_k)

        # Add the new category
        if selected_category_id == len(self._categories) + 1:
            self._categories[len(self._categories) + 1] = new_category

        # Add the word to the chosen category
        self._categories[selected_category_id].add_word(word, word_m._meaning_probs)
        self._words_token_freq += 1
        #print word, selected_category_id

        selected_words = []
        # Pick x number of words from each category proportional to p(k|f)
        if self._hub_type.startswith("hub-categories-prob"):
            selected_words = self.select_words_from_categ(post_prob_k)
            
        else:
            categ_words = self._categories[selected_category_id]._words.keys()[:]
            categ_words_num = len(categ_words)

            # when hub-type == hub-categories
            selected_words =  categ_words[:]
            
            if self._hub_type in ["hub-categories-context", "hub-categories-random", "hub-categories-partial"]: 
                vert_num = len(self._words_vertex_map.keys())
                hubs_num = round(self._hubs_num * vert_num)
                #hubs_num = self._hubs_num 

                if categ_words_num > hubs_num:
                    indices = self.random_selection(hubs_num, 0, categ_words_num -1)
                    selected_words = []
                    for index in indices:
                        selected_words.append(categ_words[index])


        categ_hubs = []
        for oth_word in selected_words:
            oth_node = self._words_vertex_map[oth_word]
            categ_hubs.append([oth_node, 1])
            
        return categ_hubs



    def add_word(self, context, word, acq_score, lexicon, last_time, ctime, beta, simtype):
        ''' add a new word to the graph or update its connections '''

        marked = set([]) # Mark vertices that have already been visited
        word_m = lexicon.meaning(word)
#        word_top_features =self.select_features(word_m._meaning_probs)

        # add the word to the graph
        if not word in self._words_vertex_map:
            self._words_vertex_map[word] = self._graph.add_vertex()
            self._graph.vertex_properties["label"][self._words_vertex_map[word]] = word
            self._graph.vertex_properties["freq"][self._words_vertex_map[word]] = 0
            
            if len(self._highest_degree_nodes) < self._hubs_num:
                self._highest_degree_nodes.append([self._words_vertex_map[word], 0])


        # if the word was already in the graph, update its connections
        #TODO Investigate if we need to do this.
        else:
            vertex = self._words_vertex_map[word]
            edges = list(vertex.out_edges())
            for edge in edges:
                target_w =  self._graph.vertex_properties["label"][edge.target()]

                if target_w == word:
                    target_w = self._graph.vertex_properties["label"][edge.source()]
                    print "ERROR"

                target_w_m = lexicon.meaning(target_w)
                #target_w_top_features = self.select_features(target_w_m._meaning_probs)

                marked.add(self._words_vertex_map[target_w])

                self.add_edge(word, target_w, word_m, target_w_m, beta, simtype)
                self._graph.remove_edge(edge)

            self.update_computations.append(len(marked))

        vert = self._words_vertex_map[word]
        self._graph.vertex_properties["acqscore"][vert] = acq_score

        self._graph.vertex_properties["freq"][vert] = \
        self._graph.vertex_properties["freq"][vert] + 1
        
        self.update_most_list(vert, self._graph.vertex_properties["freq"][vert], self._most_frequent_nodes, self._hubs_num, "freq")

        categ_hubs = []
        hubs = []
        '''
        vert_num = len(self._words_vertex_map.keys())
        #number of comparisons
        hubs_num = int(round(self._hubs_num * vert_num))
        #deduct the number of used comparisons, ie, # of current edges that are updated.
        hubs_num -= len(marked) 
        if not (hubs_num in ["hub-categories", "hub-categories-prob", \
                "hub-categories-partial", "hub-context", "hub-random",\
                "hub-freq", "hub-degree"]):
            hubs_num  = hubs_num // 2
        '''

        if self._hub_type.startswith("hub-categories"):
            categ_hubs = self._add_word_to_category(word, word_m)#, lexicon, marked, beta, simtype) #TODO
       
        if not  (self._hub_type in ["hub-categories", "hub-categories-partial", "hub-categories-prob"]):
            hubs = self.select_hubs(context)
        
#        print word 

        if self._hub_type in ["hub-random", "hub-context", "hub-context-random",\
                "hub-degree", "hub-degree-random", "hub-freq", "hub-freq-random"] \
                or self._hub_type.startswith("hub-categories"):
            # "hub-categories", "hub-categories-context", "hub-categories-random", "hub-categories-partial"]:
            update_num = 0
            # calculate similarity of the word and hub
            for hub, score in (hubs + categ_hubs):
                if hub in marked: continue
                marked.add(hub)

                hword = self._graph.vertex_properties["label"][hub]
                hword_m = lexicon.meaning(hword)
                edge_added = self.add_edge(word, hword, word_m, hword_m, beta, simtype)
                update_num +=1
            
            self.new_computations.append(update_num)
        '''               
        #TODO WE ARE NOT USING THIS
        else:
            # calculate similarity of the word and hub
            for hub, score in hubs:
                if hub in marked: continue
                marked.add(hub)

                hword = self._graph.vertex_properties["label"][hub]
                hword_m = lexicon.meaning(hword)

                edge_added = self.add_edge(word, hword, word_m, hword_m, beta, simtype)
                if not edge_added: continue

                for neighbor in hub.all_neighbours():
                    if neighbor in marked: continue
                    marked.add(neighbor)
                    neighbor_word = self._graph.vertex_properties["label"][neighbor]
                    nword_m = lexicon.meaning(word)
                    
                    self.add_edge(word, neighbor_word, word_m, nword_m, beta, simtype)
        '''
        # calculate the number of computations
        self.max_computations.append(self._graph.num_vertices())
        
        #print "number of computations" ,  len(marked)
        
        self.computations.append(len(marked))


    def plot(self, graph, filename):
        """ Plot a graph """
#        ebet = betweenness(graph)[1]
        name = graph.vertex_properties["label"]
#        acq_scores = graph.vertex_properties["acqscore"]
        distances = graph.edge_properties["distance"]
        deg = graph.degree_property_map("total")
        pos = sfdp_layout(graph)
        #arf_layout(graph)
        graph_draw(graph, pos= pos, vertex_text=name, vertex_font_size=12, vertex_fill_color= deg, vorder=deg,\
        edge_pen_width=distances, output=filename + "graph.png", output_size=(3000,3000), nodesfirst=False)

    def print_hubs(self, filename, last_time, ctime):
        """ Print hubs of the graph """

        hubs = self.select_hubs([])
        stat_file = open(filename, 'a')
        stat_file.write("\nThe final hubs of the semantic network:\n")
        st = ""
        if hubs != None:
            for hub,score in hubs:
                st +=  self._graph.vertex_properties["label"][hub] + ","
            stat_file.write(st + "\n")
        stat_file.close()

    def print_computations(self, filename):

        stat_file = open(filename, 'a')
        (avg, std) = np.mean(self.max_computations), np.std(self.max_computations)
        stat_file.write("\navg maximum computations over words:" + "%.2f +/- %.2f" % (avg, std) + "\n")

        (avg, std) = np.mean(self.computations), np.std(self.computations)
        stat_file.write("avg actual computations over words:" + "%.2f +/- %.2f" % (avg, std) + "\n")

        (avg, std) = np.mean(self.update_computations), np.std(self.update_computations)
        stat_file.write("avg update computations over words:" + "%.2f +/- %.2f" % (avg, std) + "\n")

        (avg, std) = np.mean(self.new_computations), np.std(self.new_computations)
        stat_file.write("avg new computations over words:" + "%.2f +/- %.2f" % (avg, std) + "\n")

        stat_file.close()


    def calc_distances(self, graph):
        distance = {}
        for v in graph.vertices():
            w = graph.vertex_properties["label"][v]
            if not w in distance.keys():
                distance[w] = {}

            distmap = shortest_distance(graph, v, weights=graph.edge_properties["distance"]) #TODO
            for othv in graph.vertices():
                othw = graph.vertex_properties["label"][othv]
                if othw == w:
                    continue
                distance[w][othw] = distmap[othv]
        return distance

    def calc_graph_ranks(self, graph_distances ):
        # Rank the targets for each cue in the graph
        graph_ranks = {}
        for cue in graph_distances:
            graph_ranks[cue] = {}

            sorted_targets = []
            for target in graph_distances[cue]:
                sorted_targets.append([target, graph_distances[cue][target]])
            sorted_targets = sorted(sorted_targets, key=itemgetter(1))

            max_rank = 100000
            for ind in range(len(sorted_targets)):
                if sorted_targets[ind][1] == sys.float_info.max or sorted_targets[ind][1] == 2147483647:
                    rank = max_rank
                else:
                    rank = ind + 1

                graph_ranks[cue][sorted_targets[ind][0]] = rank

        return graph_ranks

    def calc_correlations(self, gold_sim, distances, consider_notconnected):
        print "calc_correlations"
        graph_pairs, gold_pairs = [], []
        not_connected = 0
        all_pairs = 0.0


        for cue in gold_sim:
            for target, score in gold_sim[cue]:
                all_pairs += 1

                if distances[cue][target] ==  sys.float_info.max or \
                    distances[cue][target]== 2147483647:
                    not_connected += 1

                    #print cue, target, score, distances[cue][target]

                    if not consider_notconnected:
                        continue

                gold_pairs.append(score)  #TODO sim vs distance
                graph_pairs.append(distances[cue][target])
        print "--------------------"
        if len(graph_pairs) == 0:
            print "nothing matched"
            return (0.0,0.0), (0.0, 0.0), 0.0

        #pearson_r, pearson_pvalue
        pearson = scipy.stats.pearsonr(gold_pairs, graph_pairs)
        #spearman_t, spearman_pvalue
        spearman = scipy.stats.spearmanr(gold_pairs, graph_pairs)

        print "not connected", not_connected, all_pairs
        return pearson, spearman, not_connected/all_pairs


    def calc_median_rank(self, gold_sims, graph_ranks):
        """ calculate the median rank of the first five associates """

        ranks = {}
        for r in range(5):
            ranks[r] = []

        for cue in gold_sims:
            for index in range(min(len(gold_sims[cue]), 5)):
                target = gold_sims[cue][index][0]
                target_rank = graph_ranks[cue][target]
                ranks[index].append(target_rank)

        return ranks



    def evaluate_semantics(self, graph_distances, graph_ranks,  gold_sim, filename, gold_name):


        ranks = self.calc_median_rank(gold_sim, graph_ranks)
        for index in ranks:
            print ranks[index]
        
        stat_file = open(filename, 'a')
        stat_file.write("evaluation using " + gold_name + "\n")
        stat_file.write("median rank of first five associates for " + str(len(gold_sim.keys())) + " cues\n")
        for i in range(len(ranks.keys())):
            #print ranks[i], numpy.median(ranks[i])
            stat_file.write(str(i+1) + " associate. number of cue-target pairs: %d" % len(ranks[i]) +\
            " median rank: %.2f" %  np.median(ranks[i])+"\n")

        # Calc correlations
        pearson, spearman, not_connected = self.calc_correlations(gold_sim, graph_distances, False)
        stat_file.write("\n Not considering pairs that are not connected in the graph\n")
        stat_file.write("pearson  correlation %.2f p-value %.2f" % pearson + "\n")
        stat_file.write("spearman correlation %.2f p-value %.2f" % spearman + "\n")
        stat_file.write("cue-target pairs that are not_connected in the graph %.2f" % not_connected + "\n\n")

        pearson, spearman, not_connected = self.calc_correlations(gold_sim, graph_distances, True)
        stat_file.write("Considering pairs that are not connected in the graph\n")
        stat_file.write("pearson  correlation %.2f p-value %.2f" % pearson + "\n")
        stat_file.write("spearman correlation %.2f p-value %.2f" % spearman + "\n")
        stat_file.write("cue-target pairs that are not_connected in the graph %.2f" % not_connected + "\n\n")

    def evaluate(self, last_time, current_time, gold_lexicon, learned_lexicon, beta, simtype, data_path, filename):
        words = self._words_vertex_map.keys()

        #gold_graph = self.create_final_graph(words, gold_lexicon, beta, simtype)
        #learned_graph = self.create_final_graph(words, learned_lexicon, beta, simtype)
        
        grown_graph = self._graph

        if self._hub_type != "hub-categories":
            self.print_hubs(filename + "_grown.txt", last_time, current_time) #CHECK

        self.print_computations(filename + "_grown.txt") #CHECK

        
        #nelson_norms = process_norms(data_path +"/norms/", words)
        #wn_jcn_sims = wordnet_similarity(words, "jcn", self._wnlabels)
        wn_wup_sims = wordnet_similarity(words, "wup", self._wnlabels)
        #wn_path_sims = wordnet_similarity(words, "path",self._wnlabels)

        #rg_norms = process_rg_norms(data_path+"/Rubenstein-Goodenough.txt", words)
        #wordsims353_norms = process_rg_norms(data_path + "/wordsim353/combined.tab",words)
        
        for g, tag in [[grown_graph, "_grown"]]:#, [gold_graph, "_gold"], [learned_graph, "_learned"]]:
        #    self.plot(g, filename + tag + "_")
            self.calc_small_worldness(g, filename + tag)

            
            distances = self.calc_distances(g)
            graph_ranks = self.calc_graph_ranks(distances)

            self.evaluate_semantics(distances, graph_ranks, wn_wup_sims, filename + tag + ".txt", "wordnet using WUP sim measure")
        
        #    self.evaluate_semantics(distances, graph_ranks, nelson_norms, filename + tag + ".txt", "Nelson norms")
        #    self.evaluate_semantics(distances, graph_ranks, wn_jcn_sims, filename + tag + ".txt", "wordnet using JCN sim measure")
        #    self.evaluate_semantics(distances, graph_ranks, wn_path_sims, filename + tag + ".txt", "wordnet using Path sim measure")
        #    self.evaluate_semantics(distances, graph_ranks, rg_norms, filename + tag + ".txt", "Rubenstein-Goodenough norms")

        #    self.evaluate_semantics(distances, graph_ranks, wordsims353_norms, filename + tag + ".txt", "Wordsim353 norms")
            

    def calc_small_worldness(self, graph, filename):
        avg_c, median_sp = self.calc_graph_stats(graph, filename)

        rand_graph = Graph(graph)
        rejection_count = random_rewire(rand_graph, "erdos")
        print "rejection count", rejection_count
        rand_avg_c, rand_median_sp = self.calc_graph_stats(rand_graph, filename)

        stat_file = open(filename + ".txt", 'a')
        stat_file.write("small-worldness %.3f" % ((avg_c / rand_avg_c)/(float(median_sp)/rand_median_sp)) + "\n\n")
        stat_file.close()

    def calc_graph_stats(self, graph, filename):
        """ calc graph stats """

        """Average Local Clustering Coefficient"""
        local_clust_co = local_clustering(graph)
        avg_local_clust = vertex_average(graph, local_clust_co)

        """Average Degree (sparsity)"""
        avg_total_degree = vertex_average(graph, "total")

        nodes_num = graph.num_vertices()
        edges_num = graph.num_edges()

        """ Largest Component of the Graph"""
        lc_labels = label_largest_component(graph)

        lc_graph = Graph(graph)
        lc_graph.set_vertex_filter(lc_labels)
        lc_graph.purge_vertices()

        """Average Shortest Path in LCC"""
        lc_distances = lc_graph.edge_properties["distance"]
        dist = shortest_distance(lc_graph)#, weights=lc_distances) #TODO
        dist_list = []
        for v in lc_graph.vertices():
            dist_list += list(dist[v].a)


        """ Median Shortest Path """
        distances = graph.edge_properties["distance"] #TODO
        gdist = shortest_distance(graph)#, weights=distances)
        graph_dists = []
        counter = 0
        for v in graph.vertices():
            for othv in gdist[v].a:
                if othv != 0.0: # exclude the zero distance from a node to itself
                    graph_dists.append(othv)
                else:
                    counter +=1
      #  print "num v", graph.num_vertices(), counter
        median_sp = np.median(graph_dists)
      #  print "median", median_sp#, graph_dists


        stat_file = open(filename + ".txt", 'a')
        stat_file.write("number of nodes:"+ str(nodes_num) + "\nnumber of edges:" + str(edges_num) + "\n")
        stat_file.write("avg total degree:" + "%.2f +/- %.2f" % avg_total_degree  + "\n")
        stat_file.write("sparsity:" + "%.2f" % (avg_total_degree[0] / float(nodes_num))  + "\n")

        stat_file.write("number of nodes in LLC:"+  str(lc_graph.num_vertices()) + "\nnumber of edges in LLC:" + str(lc_graph.num_edges()) + "\n")
        stat_file.write("connectedness:" + "%.2f" % (lc_graph.num_vertices()/float(nodes_num)) + "\n")
        stat_file.write("avg distance in LCC:" + "%.2f +/- %.2f" % (np.mean(dist_list), np.std(dist_list)) + "\n\n")

        stat_file.write("avg local clustering coefficient:" + "%.2f +/- %.2f" % avg_local_clust + "\n")
        stat_file.write("median distnace in graph:" + "%.2f" % median_sp + "\n\n")

       # Plotting the degree distribution
        ''' 
        plt.clf()
        hist = vertex_hist(graph, "total")
        prob_hist = []
        sum_hist = sum(hist[0])
        for h in hist[0]:
            prob_hist.append(h/float(sum_hist))

        plt.plot(hist[1][1:], prob_hist, 'ro')#, normed=False, facecolor='green', alpha=0.5)
        plt.xlabel('K')
        plt.gca().set_yscale("log")
        plt.gca().set_xscale("log")
        plt.ylabel('P(K)')
        #plt.title(r'Histogram of degrees of the graph')
        #data_1 = graph.degree_property_map("total").a#, graph.degree_property_map, len( graph.degree_property_map("total").a)
        #fit = powerlaw.Fit(data_1) TODO
        #stat_file.write("alpha of powerlaw " + str(fit.power_law.alpha) + "\n\n")
        #print fit.power_law.xmin
        #fit.plot_pdf(ax=plt.gca(),  linewidth=3, color='b')
        #fit.power_law.plot_pdf(ax=plt.gca(), color='g', linestyle='--')
        plt.savefig(filename + "_loglog_degree_histogram.png")
        
        plt.clf()
        plt.plot(hist[1][1:], prob_hist, 'ro')#, normed=False, facecolor='green', alpha=0.5)
        plt.xlabel('K')
        plt.ylabel('P(K)')
        #
        plt.savefig(filename + "_degree_histogram.png")
        '''
        
        stat_file.close()

        return avg_local_clust[0], median_sp

    def create_final_graph(self, words, lexicon, beta, simtype):
        """ create a graph, given a set of words and their meanings """

        graph = Graph(directed=False)
        graph.vertex_properties["label"] = graph.new_vertex_property("string")
        graph.edge_properties["distance"]  = graph.new_edge_property("double")
        graph.vertex_properties["acqscore"] = graph.new_vertex_property("double")


        word_vertex_map = {}

        for word in words:
            word_vertex_map[word] = graph.add_vertex()
            graph.vertex_properties["label"][word_vertex_map[word]] = word


        for word in words:
            for otherword in words:
                if word == otherword:
                    continue

                vert = word_vertex_map[word]
                othervert =  word_vertex_map[otherword]

                if graph.edge(vert, othervert) != None or graph.edge(othervert, vert)!= None:
                    continue

                word_m = lexicon.meaning(word)
#                word_m_top_features = self.select_features(word_m._meaning_probs)

                otherword_m = lexicon.meaning(otherword)
#                otherword_m_top_features = self.select_features(otherword_m._meaning_probs)


                #sim = self.calculate_similarity(word_m_top_features, otherword_m_top_features)
                sim = evaluate.calculate_similarity(beta, word_m, otherword_m, simtype)

                if sim >= self._sim_threshold:
                    new_edge = graph.add_edge(vert, othervert)
                    graph.edge_properties["distance"][new_edge] = max(0, 1 - sim ) #distance #TODO

        return graph
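
A note on what the edge weights above encode: the _calculate_similarity helper (apparently unused in the code shown) computes a plain cosine similarity over the union of two words' feature keys, while add_edge delegates to evaluate.calculate_similarity and stores max(0, 1 - sim) as the "distance" edge property. The following self-contained sketch reproduces that cosine computation on two made-up feature dictionaries; the feature names and probabilities are hypothetical, and only numpy and math are required.

# Standalone sketch of the cosine similarity used by _calculate_similarity.
import math
import numpy as np

def cosine_similarity(word_features, other_word_features):
    # Build two aligned vectors over the union of feature keys, then take their cosine.
    features = sorted(set(word_features) | set(other_word_features))
    vec1 = np.array([word_features.get(f, 0.0) for f in features])
    vec2 = np.array([other_word_features.get(f, 0.0) for f in features])
    return np.dot(vec1, vec2) / (math.sqrt(np.dot(vec1, vec1)) * math.sqrt(np.dot(vec2, vec2)))

sim = cosine_similarity({"animal": 0.6, "pet": 0.3}, {"animal": 0.5, "toy": 0.2})
distance = max(0, 1 - sim)  # what add_edge would store in the "distance" edge property
print("similarity %.3f distance %.3f" % (sim, distance))

With a sim_threshold around 0.6 (the value hinted at in the constructor comment), these two toy words would be connected, since their cosine similarity is roughly 0.83.
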
Example #4
class WordsGraph():

    def __init__(self, hubs_num, sim_threshold, hub_type, \
        coupling, lambda0, a0, miu0, sigma0, sampling_method):

        self._graph = Graph(directed=False)
        self._graph.vertex_properties[
            "label"] = self._graph.new_vertex_property("string")
        self._graph.vertex_properties[
            "acqscore"] = self._graph.new_vertex_property("double")
        self._graph.vertex_properties[
            "freq"] = self._graph.new_vertex_property("int")

        self._graph.edge_properties[
            "distance"] = self._graph.new_edge_property("double")
        self._graph.edge_properties[
            "similarity"] = self._graph.new_edge_property("double")

        # Maps each word to the vertex object
        self._words_vertex_map = {}

        # Information/parameters about categories
        self._categories = {}
        self._words_token_freq = 0.0
        self._coupling = coupling
        self._lambda0 = lambda0
        self._a0 = a0
        self._miu0 = miu0
        self._sigma0 = sigma0
        self._sampling_method = sampling_method

        self._wnlabels = WordnetLabels()

        # Parameters
        # Number of hubs that are compared with each word
        self._hubs_num = hubs_num  #75
        # The similarity threshold for connecting two words
        self._sim_threshold = sim_threshold  #0.6
        self._hub_type = hub_type

        self.max_computations = []
        self.computations = []
        #The number of computations used in updating current edges
        self.update_computations = []
        #the number of new computations done
        self.new_computations = []

        # List to keep top nodes
        self._most_frequent_nodes = []

        self._highest_degree_nodes = []

    def _select_features(self, word_features, N=15):
        '''Select the N features of a word that have the highest probabilities.'''
        #should move to meaning
        sorted_features = []
        for feature in word_features:
            sorted_features.append([feature, word_features[feature]])
        # Sort by probability in descending order so the [:N] slice below keeps the most probable features
        sorted_features = sorted(sorted_features, key=itemgetter(1), reverse=True)

        selected_features = {}
        for feature, value in sorted_features[:N]:
            selected_features[feature] = value

        return selected_features

    def _calculate_similarity(self, word_features, other_word_features):
        ''' Calculate the cosine similarity between two words, given their feature dictionaries '''
        #should move to evaluate
        features = set(word_features.keys()) | set(other_word_features.keys())

        meaning1_vec = np.zeros(len(features))
        meaning2_vec = np.zeros(len(features))

        i = 0
        for feature in features:
            if word_features.has_key(feature):
                meaning1_vec[i] = word_features[feature]
            if other_word_features.has_key(feature):
                meaning2_vec[i] = other_word_features[feature]
            i += 1

        cos = np.dot(meaning1_vec, meaning2_vec)

        x = math.sqrt(np.dot(meaning1_vec, meaning1_vec))

        y = math.sqrt(np.dot(meaning2_vec, meaning2_vec))

        return cos / (x * y)

    def calc_vertex_score(self, rel_freq, rel_degree, recency):
        if self._hub_type == "hub-freq":
            return rel_freq

        if self._hub_type == "hub-degree":
            return rel_degree

        if self._hub_type == "hub-recency":
            return recency

        if self._hub_type == "hub-freq-degree-recency":
            return rel_freq * rel_degree * recency

        if self._hub_type == "hub-freq-degree":
            return 0.5 * (rel_freq + rel_degree)
        '''
        sum_freq = 0.0
        for v in self._graph.vertices():
            sum_freq +=  self._graph.vertex_properties["freq"][v]

        sum_degree = float(sum(self._graph.degree_property_map("total").a))

        for v in self._graph.vertices():
            rel_freq = self._graph.vertex_properties["freq"][v] / sum_freq

            rel_degree = 0
            if sum_degree != 0:
                rel_degree = v.out_degree() / sum_degree

            word = self._graph.vertex_properties["label"][v]
            recency = 1.0 / (ctime - last_time[word] +   1)

            score = self.calc_vertex_score(rel_freq, rel_degree, recency)
            vertices.append([v, score])

        #print word, rel_freq,  recency, rel_degree, score

        vertices = sorted(vertices, key=itemgetter(1), reverse=True)

        return vertices[:hubs_num]
        '''

    def update_most_list(self,
                         vertex,
                         vertex_value,
                         maximum_list,
                         list_desired_size,
                         t="deg"):
        '''Keep maximum_list as a list of [vertex, value] pairs, sorted by value and capped at list_desired_size.'''
        list_desired_size = 150  #TODO CHANGE

        #TODO sorting the list is not the most efficient way to do this -- change?
        #print vertex, vertex_value
        for i in range(len(maximum_list)):
            v = maximum_list[i][0]
            if self._graph.vertex_properties["label"][
                    v] == self._graph.vertex_properties["label"][vertex]:
                if vertex_value < maximum_list[i][1] and t == "freq":
                    print "ERROR", self._graph.vertex_properties["label"][
                        v], v, maximum_list[i][1], vertex_value

                maximum_list[i][1] = vertex_value
                maximum_list.sort(key=itemgetter(1))

                return

        if len(maximum_list) < list_desired_size:
            maximum_list.append([vertex, vertex_value])
            maximum_list.sort(key=itemgetter(1))
        else:
            if vertex_value > maximum_list[0][1]:
                maximum_list[0][0] = vertex
                maximum_list[0][1] = vertex_value
                maximum_list.sort(key=itemgetter(1))

    # print maximum_list
    '''    
    if self._hub_type == "hub-degree-freq-context":
        for w in context:
            vertices.append([self._words_vertex_map[w], 1.])

        return self._highest_degree_nodes + self._most_frequent_nodes + vertices

    if self._hub_type == "hub-freq-context":
        for w in context:
            vertices.append([self._words_vertex_map[w], 1.])

        return self._most_frequent_nodes + vertices
    '''

    def select_hubs(self, context):
        vertices = []
        vert_num = len(self._words_vertex_map.keys())
        hubs_num = int(round(self._hubs_num * vert_num))

        if self._hub_type in ["hub-freq", "hub-freq-random"]:
            vertices = self._most_frequent_nodes[-1 * hubs_num:][:]

        if self._hub_type in ["hub-degree", "hub-degree-random"]:
            vertices = self._highest_degree_nodes[-1 * hubs_num:][:]

        if self._hub_type in ["hub-context", "hub-context-random", \
                "hub-categories-context", "hub-categories-prob-context"]:
            #hubs_num = self._hubs_num

            selected_context = context
            if hubs_num < len(context):
                selected_context = context[-1 * hubs_num:]

            for w in selected_context:
                vertices.append([self._words_vertex_map[w], 1.])

        if self._hub_type in ["hub-random", "hub-context-random", "hub-freq-random", "hub-degree-random",\
                "hub-categories-random", "hub-categories-prob-random"]:
            #hubs_num = self._hubs_num

            indices = range(0, vert_num)
            if vert_num > hubs_num:
                indices = self.random_selection(hubs_num, 0, vert_num - 1)

            for index in indices:
                vertices.append([self._graph.vertex(index), 1.])

        return vertices

    def random_selection(self, num, min_range, max_range):
        selected = []
        used = set([])

        while len(selected) < num:
            rand_index = random.randint(min_range, max_range)
            if rand_index in used: continue
            used.add(rand_index)
            selected.append(rand_index)

        return selected

    def add_edge(self, word, other_word, word_m, other_word_m, beta, simtype):
        ''' Add an edge between the two given words, if their similarity is
        higher than a threshold'''

        if word == other_word:
            return False

        #sim = self.calculate_similarity(word_features, other_word_features)
        sim = evaluate.calculate_similarity(beta, word_m, other_word_m,
                                            simtype)

        # if the words are similar enough, connect them.
        # TODO this can be done probabilistically -- that is connect based on similarity
        if sim >= self._sim_threshold:

            vert = self._words_vertex_map[word]
            other_vert = self._words_vertex_map[other_word]

            new_edge = self._graph.add_edge(vert, other_vert)
            self._graph.edge_properties["distance"][new_edge] = max(0, 1 - sim)
            self._graph.edge_properties["similarity"][new_edge] = sim

            #update the list of nodes with most degree

            self.update_most_list(vert, vert.out_degree(),
                                  self._highest_degree_nodes, self._hubs_num)
            self.update_most_list(other_vert, other_vert.out_degree(),
                                  self._highest_degree_nodes, self._hubs_num)
            return True

        return False

    def evaluate_categories(self, filename):
        ''' Use WordNet labels to calculate precision & recall for the created categories '''

        #Count for each label in all the categories
        labels_count = {}

        for category_id in self._categories.keys():
            category = self._categories[category_id]
            category_labels = {}
            category_words_count = 0

            # Count the frequency of each label type of words in this category and
            # the number of words in this category
            for word in category._words.keys():
                label = self._wnlabels.wordnet_label(word)

                #TODO Words that do not have a wn-label are all treated as sharing a single label
                #if label == CONST.NONE:
                #    continue

                if label not in category_labels:
                    category_labels[label] = 0

                # Add the frequency of the word in the category
                category_labels[label] += category._words[word]
                category_words_count += category._words[word]

            #if len(labels) < 1:
            #    continue

            # This category's label is the most frequent of all the words' labels
            most_frequent_label = ""
            freq = 0

            print "category", category._id
            for label in category_labels:
                print "wn-label", label, category_labels[
                    label], category_words_count

                if category_labels[label] > freq:
                    freq = category_labels[label]
                    most_frequent_label = label

                if not labels_count.has_key(label):
                    labels_count[label] = 0
                labels_count[label] += category_labels[label]

            category._label = most_frequent_label
            category._precision = float(freq) / category_words_count
            category._freq = float(freq)
            print "----"

        statfile = open(filename + "categories.txt", 'a')
        #print
        all_precisions = []
        all_recalls = []
        for category_id in self._categories:
            category = self._categories[category_id]
            category._recall = category._freq / labels_count[category._label]
            print category._id, category._label, "freq", category._freq, np.sum(
                category._words.values()
            ), '---precision', category._precision, "recall", category._recall
            all_precisions.append(category._precision)
            all_recalls.append(category._recall)
            statfile.write("id %s label %s freq %d precision %.2f recall %.2f \n" % \
                    (category._id, category._label, np.sum(category._words.values()), category._precision, category._recall))

        statfile.write("avg_precision %.2f avg_recall %.2f \n" %
                       (np.mean(all_precisions), np.mean(all_recalls)))
        statfile.close()

    def pick_category(self, post_prob_k):

        # Find the category with max post prob
        if self._sampling_method == 'map':  #local MAP
            max_category_id = 1
            for category_id in post_prob_k:
                if post_prob_k[category_id] > post_prob_k[max_category_id]:
                    max_category_id = category_id
            return max_category_id

        elif self._sampling_method == 'spf':  #single-particle particle filter
            #            print self._sampling_method
            rand = random.random()
            min_range = 0
            denom = logsumexp(post_prob_k.values())

            for category_id in post_prob_k:
                ppk = math.exp(post_prob_k[category_id] - denom)
                if min_range <= rand < ppk + min_range:
                    return category_id
                min_range += ppk

    def select_words_from_categ(self, post_prob_k):
        denom = logsumexp(post_prob_k.values())
        selected_words = set([])

        vert_num = len(self._words_vertex_map.keys())
        #hubs_num = round(self._hubs_num * vert_num)

        # TODO changed from round to ceil June 5
        hubs_num = np.ceil(self._hubs_num * vert_num)

        for category_id in self._categories:
            ppk = math.exp(post_prob_k[category_id] - denom)
            select_words_num = round(hubs_num * ppk)

            categ_words = self._categories[category_id]._words.keys()[:]
            indices = range(0, len(categ_words))
            if len(categ_words) > select_words_num:
                indices = self.random_selection(select_words_num, 0,
                                                len(categ_words) - 1)

            for index in indices:
                selected_words.add(categ_words[index])

        #print len(selected_words)
        return selected_words

    def _add_word_to_category(self, word,
                              word_m):  #, lexicon, marked, beta, simtype):
        '''Assign the word to a category (n refers to the token count, not the type count)'''

        post_prob_k = {}  #P(K|W), where K is the category

        for category_id in self._categories:
            category = self._categories[category_id]
            post_prob_k[category_id] = category.posterior(
                word, word_m._meaning_probs, self._words_token_freq)
            #post_prob_k[category_id] = category.posterior(word, word_top_features, self._words_token_freq)


        new_category = Category(len(self._categories) + 1, self._coupling, self._lambda0, self._a0, \
        self._miu0, self._sigma0)
        post_prob_k[new_category._id] = new_category.posterior(
            word, word_m._meaning_probs, self._words_token_freq)
        #        post_prob_k[new_category._id] = new_category.posterior(word, word_top_features, self._words_token_freq)

        selected_category_id = self.pick_category(post_prob_k)

        # Add the new category
        if selected_category_id == len(self._categories) + 1:
            self._categories[len(self._categories) + 1] = new_category

        # Add the word to the chosen category
        self._categories[selected_category_id].add_word(
            word, word_m._meaning_probs)
        self._words_token_freq += 1
        #print word, selected_category_id

        selected_words = []
        # Pick x number of words from each category proportional to p(k|f)
        if self._hub_type.startswith("hub-categories-prob"):
            selected_words = self.select_words_from_categ(post_prob_k)

        else:
            categ_words = self._categories[selected_category_id]._words.keys(
            )[:]
            categ_words_num = len(categ_words)

            # when hub-type == hub-categories
            selected_words = categ_words[:]

            if self._hub_type in [
                    "hub-categories-context", "hub-categories-random",
                    "hub-categories-partial"
            ]:
                vert_num = len(self._words_vertex_map.keys())
                hubs_num = round(self._hubs_num * vert_num)
                #hubs_num = self._hubs_num

                if categ_words_num > hubs_num:
                    indices = self.random_selection(hubs_num, 0,
                                                    categ_words_num - 1)
                    selected_words = []
                    for index in indices:
                        selected_words.append(categ_words[index])

        categ_hubs = []
        for oth_word in selected_words:
            oth_node = self._words_vertex_map[oth_word]
            categ_hubs.append([oth_node, 1])

        return categ_hubs

    def add_word(self, context, word, acq_score, lexicon, last_time, ctime,
                 beta, simtype):
        ''' add a new word to the graph or update its connections '''

        marked = set([])  # Mark vertices that have already been visited
        word_m = lexicon.meaning(word)
        #        word_top_features =self.select_features(word_m._meaning_probs)

        # add the word to the graph
        if word not in self._words_vertex_map:
            new_vertex = self._graph.add_vertex()
            self._words_vertex_map[word] = new_vertex
            self._graph.vertex_properties["label"][new_vertex] = word
            self._graph.vertex_properties["freq"][new_vertex] = 0

            if len(self._highest_degree_nodes) < self._hubs_num:
                self._highest_degree_nodes.append([new_vertex, 0])

        # if the word was already in the graph, update its existing connections
        #TODO Investigate whether we need to do this.
        else:
            vertex = self._words_vertex_map[word]
            edges = list(vertex.out_edges())
            for edge in edges:
                target_w = self._graph.vertex_properties["label"][edge.target()]

                if target_w == word:
                    target_w = self._graph.vertex_properties["label"][edge.source()]
                    print "ERROR: unexpected edge orientation for", word

                target_w_m = lexicon.meaning(target_w)
                #target_w_top_features = self.select_features(target_w_m._meaning_probs)

                marked.add(self._words_vertex_map[target_w])

                self.add_edge(word, target_w, word_m, target_w_m, beta,
                              simtype)
                self._graph.remove_edge(edge)

            self.update_computations.append(len(marked))

        vert = self._words_vertex_map[word]
        self._graph.vertex_properties["acqscore"][vert] = acq_score

        self._graph.vertex_properties["freq"][vert] = \
        self._graph.vertex_properties["freq"][vert] + 1

        self.update_most_list(vert,
                              self._graph.vertex_properties["freq"][vert],
                              self._most_frequent_nodes, self._hubs_num,
                              "freq")

        categ_hubs = []
        hubs = []
        '''
        vert_num = len(self._words_vertex_map.keys())
        # number of comparisons
        hubs_num = int(round(self._hubs_num * vert_num))
        # deduct the comparisons already used, i.e., the current edges that were updated
        hubs_num -= len(marked)
        if not (self._hub_type in ["hub-categories", "hub-categories-prob", \
                "hub-categories-partial", "hub-context", "hub-random",\
                "hub-freq", "hub-degree"]):
            hubs_num = hubs_num // 2
        '''

        if self._hub_type.startswith("hub-categories"):
            categ_hubs = self._add_word_to_category(word, word_m)  #, lexicon, marked, beta, simtype) #TODO

        if not (self._hub_type in [
                "hub-categories", "hub-categories-partial",
                "hub-categories-prob"
        ]):
            hubs = self.select_hubs(context)
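            # select_hubs (defined elsewhere) is assumed to return [vertex, score]
            # candidates chosen by context, frequency, degree, or at random,
            # according to self._hub_type.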

#        print word

        if self._hub_type in ["hub-random", "hub-context", "hub-context-random",\
                "hub-degree", "hub-degree-random", "hub-freq", "hub-freq-random"] \
                or self._hub_type.startswith("hub-categories"):
            # "hub-categories", "hub-categories-context", "hub-categories-random", "hub-categories-partial"]:
            update_num = 0
            # calculate similarity of the word and hub
            for hub, score in (hubs + categ_hubs):
                if hub in marked: continue
                marked.add(hub)

                hword = self._graph.vertex_properties["label"][hub]
                hword_m = lexicon.meaning(hword)
                edge_added = self.add_edge(word, hword, word_m, hword_m, beta,
                                           simtype)
                update_num += 1

            self.new_computations.append(update_num)
        '''               
        #TODO WE ARE NOT USING THIS
        else:
            # calculate similarity of the word and hub
            for hub, score in hubs:
                if hub in marked: continue
                marked.add(hub)

                hword = self._graph.vertex_properties["label"][hub]
                hword_m = lexicon.meaning(hword)

                edge_added = self.add_edge(word, hword, word_m, hword_m, beta, simtype)
                if not edge_added: continue

                for neighbor in hub.all_neighbours():
                    if neighbor in marked: continue
                    marked.add(neighbor)
                    neighbor_word = self._graph.vertex_properties["label"][neighbor]
                    nword_m = lexicon.meaning(neighbor_word)
                    
                    self.add_edge(word, neighbor_word, word_m, nword_m, beta, simtype)
        '''
        # calculate the number of computations
        self.max_computations.append(self._graph.num_vertices())

        #print "number of computations" ,  len(marked)

        self.computations.append(len(marked))
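        # Bookkeeping: max_computations records the cost of comparing the word
        # against every vertex (the naive upper bound), while computations records
        # the comparisons actually made (the marked set), split above into
        # update_computations (existing edges) and new_computations (hub candidates).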

    def plot(self, graph, filename):
        """ Plot a graph """
        #        ebet = betweenness(graph)[1]
        name = graph.vertex_properties["label"]
        #        acq_scores = graph.vertex_properties["acqscore"]
        distances = graph.edge_properties["distance"]
        deg = graph.degree_property_map("total")
        pos = sfdp_layout(graph)
        #arf_layout(graph)
        graph_draw(graph, pos=pos, vertex_text=name, vertex_font_size=12,
                   vertex_fill_color=deg, vorder=deg, edge_pen_width=distances,
                   output=filename + "graph.png", output_size=(3000, 3000),
                   nodesfirst=False)

    def print_hubs(self, filename, last_time, ctime):
        """ Print hubs of the graph """

        hubs = self.select_hubs([])
        stat_file = open(filename, 'a')
        stat_file.write("\nThe final hubs of the semantic network:\n")
        st = ""
        if hubs is not None:
            for hub, score in hubs:
                st += self._graph.vertex_properties["label"][hub] + ","
            stat_file.write(st + "\n")
        stat_file.close()

    def print_computations(self, filename):

        stat_file = open(filename, 'a')

        avg, std = np.mean(self.max_computations), np.std(self.max_computations)
        stat_file.write("\navg maximum computations over words:" +
                        "%.2f +/- %.2f" % (avg, std) + "\n")

        avg, std = np.mean(self.computations), np.std(self.computations)
        stat_file.write("avg actual computations over words:" +
                        "%.2f +/- %.2f" % (avg, std) + "\n")

        avg, std = np.mean(self.update_computations), np.std(self.update_computations)
        stat_file.write("avg update computations over words:" +
                        "%.2f +/- %.2f" % (avg, std) + "\n")

        avg, std = np.mean(self.new_computations), np.std(self.new_computations)
        stat_file.write("avg new computations over words:" +
                        "%.2f +/- %.2f" % (avg, std) + "\n")

        stat_file.close()

    def calc_distances(self, graph):
        distance = {}
        for v in graph.vertices():
            w = graph.vertex_properties["label"][v]
            if w not in distance:
                distance[w] = {}

            distmap = shortest_distance(
                graph, v, weights=graph.edge_properties["distance"])  #TODO
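            # Shortest distances are weighted by the edge "distance" property
            # (1 - similarity), so smaller values mean semantically closer words;
            # unreachable vertices come back with the distance type's maximum
            # value, which is handled downstream in calc_graph_ranks and
            # calc_correlations.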
            for othv in graph.vertices():
                othw = graph.vertex_properties["label"][othv]
                if othw == w:
                    continue
                distance[w][othw] = distmap[othv]
        return distance

    def calc_graph_ranks(self, graph_distances):
        # Rank the targets for each cue in the graph
        graph_ranks = {}
        for cue in graph_distances:
            graph_ranks[cue] = {}

            sorted_targets = []
            for target in graph_distances[cue]:
                sorted_targets.append([target, graph_distances[cue][target]])
            sorted_targets = sorted(sorted_targets, key=itemgetter(1))

            max_rank = 100000
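            # Unreachable targets carry graph-tool's "infinite" sentinels
            # (sys.float_info.max for float weights, 2147483647 for int32
            # distances), so they all receive the same large constant rank
            # instead of a distance-based one.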
            for ind in range(len(sorted_targets)):
                if sorted_targets[ind][1] in (sys.float_info.max, 2147483647):
                    rank = max_rank
                else:
                    rank = ind + 1

                graph_ranks[cue][sorted_targets[ind][0]] = rank

        return graph_ranks

    def calc_correlations(self, gold_sim, distances, consider_notconnected):
        print "calc_correlations"
        graph_pairs, gold_pairs = [], []
        not_connected = 0
        all_pairs = 0.0

        for cue in gold_sim:
            for target, score in gold_sim[cue]:
                all_pairs += 1

                if distances[cue][target] in (sys.float_info.max, 2147483647):
                    not_connected += 1

                    #print cue, target, score, distances[cue][target]

                    if not consider_notconnected:
                        continue

                gold_pairs.append(score)  #TODO sim vs distance
                graph_pairs.append(distances[cue][target])
        print "--------------------"
        if len(graph_pairs) == 0:
            print "nothing matched"
            return (0.0, 0.0), (0.0, 0.0), 0.0

        #pearson_r, pearson_pvalue
        pearson = scipy.stats.pearsonr(gold_pairs, graph_pairs)
        #spearman_t, spearman_pvalue
        spearman = scipy.stats.spearmanr(gold_pairs, graph_pairs)
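        # Note: gold_pairs holds similarity scores while graph_pairs holds path
        # distances, so a well-structured graph should yield negative
        # correlations here (see the "sim vs distance" TODO above).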

        print "not connected", not_connected, all_pairs
        return pearson, spearman, not_connected / all_pairs

    def calc_median_rank(self, gold_sims, graph_ranks):
        """ calculate the median rank of the first five associates """

        ranks = {}
        for r in range(5):
            ranks[r] = []

        for cue in gold_sims:
            for index in range(min(len(gold_sims[cue]), 5)):
                target = gold_sims[cue][index][0]
                target_rank = graph_ranks[cue][target]
                ranks[index].append(target_rank)

        return ranks

    def evaluate_semantics(self, graph_distances, graph_ranks, gold_sim,
                           filename, gold_name):

        ranks = self.calc_median_rank(gold_sim, graph_ranks)
        for index in ranks:
            print ranks[index]

        stat_file = open(filename, 'a')
        stat_file.write("evaluation using " + gold_name + "\n")
        stat_file.write("median rank of first five associates for " +
                        str(len(gold_sim.keys())) + " cues\n")
        for i in range(len(ranks.keys())):
            #print ranks[i], numpy.median(ranks[i])
            stat_file.write(str(i + 1) + " associate. number of cue-target pairs: %d" % len(ranks[i]) +
                            " median rank: %.2f" % np.median(ranks[i]) + "\n")

        # Calc correlations
        pearson, spearman, not_connected = self.calc_correlations(
            gold_sim, graph_distances, False)
        stat_file.write(
            "\n Not considering pairs that are not connected in the graph\n")
        stat_file.write("pearson  correlation %.2f p-value %.2f" % pearson +
                        "\n")
        stat_file.write("spearman correlation %.2f p-value %.2f" % spearman +
                        "\n")
        stat_file.write(
            "cue-target pairs that are not_connected in the graph %.2f" %
            not_connected + "\n\n")

        pearson, spearman, not_connected = self.calc_correlations(
            gold_sim, graph_distances, True)
        stat_file.write(
            "Considering pairs that are not connected in the graph\n")
        stat_file.write("pearson  correlation %.2f p-value %.2f" % pearson +
                        "\n")
        stat_file.write("spearman correlation %.2f p-value %.2f" % spearman +
                        "\n")
        stat_file.write(
            "cue-target pairs that are not_connected in the graph %.2f" %
            not_connected + "\n\n")

    def evaluate(self, last_time, current_time, gold_lexicon, learned_lexicon,
                 beta, simtype, data_path, filename):
        words = self._words_vertex_map.keys()

        #gold_graph = self.create_final_graph(words, gold_lexicon, beta, simtype)
        #learned_graph = self.create_final_graph(words, learned_lexicon, beta, simtype)

        grown_graph = self._graph

        if self._hub_type != "hub-categories":
            self.print_hubs(filename + "_grown.txt", last_time,
                            current_time)  #CHECK

        self.print_computations(filename + "_grown.txt")  #CHECK

        #nelson_norms = process_norms(data_path +"/norms/", words)
        #wn_jcn_sims = wordnet_similarity(words, "jcn", self._wnlabels)
        wn_wup_sims = wordnet_similarity(words, "wup", self._wnlabels)
        #wn_path_sims = wordnet_similarity(words, "path",self._wnlabels)

        #rg_norms = process_rg_norms(data_path+"/Rubenstein-Goodenough.txt", words)
        #wordsims353_norms = process_rg_norms(data_path + "/wordsim353/combined.tab",words)

        for g, tag in [[
                grown_graph, "_grown"
        ]]:  #, [gold_graph, "_gold"], [learned_graph, "_learned"]]:
            #    self.plot(g, filename + tag + "_")
            self.calc_small_worldness(g, filename + tag)

            distances = self.calc_distances(g)
            graph_ranks = self.calc_graph_ranks(distances)

            self.evaluate_semantics(distances, graph_ranks, wn_wup_sims,
                                    filename + tag + ".txt",
                                    "wordnet using WUP sim measure")

        #    self.evaluate_semantics(distances, graph_ranks, nelson_norms, filename + tag + ".txt", "Nelson norms")
        #    self.evaluate_semantics(distances, graph_ranks, wn_jcn_sims, filename + tag + ".txt", "wordnet using JCN sim measure")
        #    self.evaluate_semantics(distances, graph_ranks, wn_path_sims, filename + tag + ".txt", "wordnet using Path sim measure")
        #    self.evaluate_semantics(distances, graph_ranks, rg_norms, filename + tag + ".txt", "Rubenstein-Goodenough norms")

        #    self.evaluate_semantics(distances, graph_ranks, wordsims353_norms, filename + tag + ".txt", "Wordsim353 norms")

    def calc_small_worldness(self, graph, filename):
        avg_c, median_sp = self.calc_graph_stats(graph, filename)

        rand_graph = Graph(graph)
        rejection_count = random_rewire(rand_graph, "erdos")
        print "rejection count", rejection_count
        rand_avg_c, rand_median_sp = self.calc_graph_stats(
            rand_graph, filename)
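        # random_rewire with the "erdos" model rewires edges uniformly at random,
        # keeping only the number of vertices and edges; this serves as the null
        # model for the small-worldness ratio written below,
        # S = (C / C_rand) / (L / L_rand), with C the average local clustering
        # coefficient and L the median shortest path length.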

        stat_file = open(filename + ".txt", 'a')
        stat_file.write("small-worldness %.3f" %
                        ((avg_c / rand_avg_c) /
                         (float(median_sp) / rand_median_sp)) + "\n\n")
        stat_file.close()

    def calc_graph_stats(self, graph, filename):
        """ calc graph stats """
        """Average Local Clustering Coefficient"""
        local_clust_co = local_clustering(graph)
        avg_local_clust = vertex_average(graph, local_clust_co)
        """Average Degree (sparsity)"""
        avg_total_degree = vertex_average(graph, "total")

        nodes_num = graph.num_vertices()
        edges_num = graph.num_edges()
        """ Largest Component of the Graph"""
        lc_labels = label_largest_component(graph)

        lc_graph = Graph(graph)
        lc_graph.set_vertex_filter(lc_labels)
        lc_graph.purge_vertices()
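        # label_largest_component marks vertices in the largest connected
        # component; filtering a copy of the graph and purging it leaves lc_graph
        # containing only that component, so LCC statistics can be computed on it
        # directly.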
        """Average Shortest Path in LCC"""
        lc_distances = lc_graph.edge_properties["distance"]
        dist = shortest_distance(lc_graph)  #, weights=lc_distances) #TODO
        dist_list = []
        for v in lc_graph.vertices():
            dist_list += list(dist[v].a)
        """ Median Shortest Path """
        distances = graph.edge_properties["distance"]  #TODO
        gdist = shortest_distance(graph)  #, weights=distances)
        graph_dists = []
        counter = 0
        for v in graph.vertices():
            for othv in gdist[v].a:
                if othv != 0.0:  # exclude the zero distance from the node to itself
                    graph_dists.append(othv)
                else:
                    counter += 1

    #  print "num v", graph.num_vertices(), counter
        median_sp = np.median(graph_dists)
        #  print "median", median_sp#, graph_dists

        stat_file = open(filename + ".txt", 'a')
        stat_file.write("number of nodes:" + str(nodes_num) +
                        "\nnumber of edges:" + str(edges_num) + "\n")
        stat_file.write("avg total degree:" +
                        "%.2f +/- %.2f" % avg_total_degree + "\n")
        stat_file.write("sparsity:" + "%.2f" %
                        (avg_total_degree[0] / float(nodes_num)) + "\n")

        stat_file.write("number of nodes in LLC:" +
                        str(lc_graph.num_vertices()) +
                        "\nnumber of edges in LLC:" +
                        str(lc_graph.num_edges()) + "\n")
        stat_file.write("connectedness:" + "%.2f" %
                        (lc_graph.num_vertices() / float(nodes_num)) + "\n")
        stat_file.write("avg distance in LCC:" + "%.2f +/- %.2f" %
                        (np.mean(dist_list), np.std(dist_list)) + "\n\n")

        stat_file.write("avg local clustering coefficient:" +
                        "%.2f +/- %.2f" % avg_local_clust + "\n")
        stat_file.write("median distnace in graph:" + "%.2f" % median_sp +
                        "\n\n")

        # Plotting the degree distribution
        ''' 
        plt.clf()
        hist = vertex_hist(graph, "total")
        prob_hist = []
        sum_hist = sum(hist[0])
        for h in hist[0]:
            prob_hist.append(h/float(sum_hist))

        plt.plot(hist[1][1:], prob_hist, 'ro')#, normed=False, facecolor='green', alpha=0.5)
        plt.xlabel('K')
        plt.gca().set_yscale("log")
        plt.gca().set_xscale("log")
        plt.ylabel('P(K)')
        #plt.title(r'Histogram of degrees of the graph')
        #data_1 = graph.degree_property_map("total").a#, graph.degree_property_map, len( graph.degree_property_map("total").a)
        #fit = powerlaw.Fit(data_1) TODO
        #stat_file.write("alpha of powerlaw " + str(fit.power_law.alpha) + "\n\n")
        #print fit.power_law.xmin
        #fit.plot_pdf(ax=plt.gca(),  linewidth=3, color='b')
        #fit.power_law.plot_pdf(ax=plt.gca(), color='g', linestyle='--')
        plt.savefig(filename + "_loglog_degree_histogram.png")
        
        plt.clf()
        plt.plot(hist[1][1:], prob_hist, 'ro')#, normed=False, facecolor='green', alpha=0.5)
        plt.xlabel('K')
        plt.ylabel('P(K)')
        #
        plt.savefig(filename + "_degree_histogram.png")
        '''

        stat_file.close()

        return avg_local_clust[0], median_sp

    def create_final_graph(self, words, lexicon, beta, simtype):
        """ create a graph, given a set of words and their meanings """

        graph = Graph(directed=False)
        graph.vertex_properties["label"] = graph.new_vertex_property("string")
        graph.edge_properties["distance"] = graph.new_edge_property("double")
        graph.vertex_properties["acqscore"] = graph.new_vertex_property(
            "double")

        word_vertex_map = {}
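        # Two passes: first create one vertex per word, then connect every pair
        # whose similarity (via evaluate.calculate_similarity) reaches
        # self._sim_threshold, storing 1 - sim as the edge distance.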

        for word in words:
            word_vertex_map[word] = graph.add_vertex()
            graph.vertex_properties["label"][word_vertex_map[word]] = word

        for word in words:
            for otherword in words:
                if word == otherword:
                    continue

                vert = word_vertex_map[word]
                othervert = word_vertex_map[otherword]

                if graph.edge(vert, othervert) is not None or \
                        graph.edge(othervert, vert) is not None:
                    continue

                word_m = lexicon.meaning(word)
                #                word_m_top_features = self.select_features(word_m._meaning_probs)

                otherword_m = lexicon.meaning(otherword)
                #                otherword_m_top_features = self.select_features(otherword_m._meaning_probs)

                #sim = self.calculate_similarity(word_m_top_features, otherword_m_top_features)
                sim = evaluate.calculate_similarity(beta, word_m, otherword_m,
                                                    simtype)

                if sim >= self._sim_threshold:
                    new_edge = graph.add_edge(vert, othervert)
                    graph.edge_properties["distance"][new_edge] = max(
                        0, 1 - sim)  #distance #TODO

        return graph