def find_events_from_wikipedia_baseline(self, word, max_events_per_year, years, include_score=False, min_occurrences=5):
    """Baseline: for each given year, find the events whose Wikipedia text
    mentions `word` the most times.

    :param word: query word (matched case-insensitively as a whole word)
    :param max_events_per_year: cap on events kept per year
    :param years: iterable of years to scan
    :param include_score: if True, append '--<score>' to each event string
    :param min_occurrences: an event is kept only if its occurrence count is
        strictly greater than this threshold
    :return: OrderedDict year -> list of events (best first), or None if
        `word` is empty
    """
    if not word:
        return None
    word = word.lower()
    # The pattern depends only on `word`, so compile it once instead of
    # rebuilding it for every event in every year.
    word_pattern = re.compile(r'\b%s\b' % re.escape(word))
    key_years_to_events = OrderedDict([(year, []) for year in years])
    for key_year in years:
        # keep only the top-scoring events for this year
        top_key_events = MaxHeap(max_events_per_year)
        for e in self.year_to_event[key_year]:
            # count whole-word occurrences of `word` in the Wiki content
            score = sum(1 for _ in word_pattern.finditer(
                self.event_to_text_content[e].lower()))
            if score > min_occurrences:
                top_key_events.add(score, e)
        top_key_events = sorted(top_key_events.heap, reverse=True)
        key_years_to_events[key_year] = [
            item[1] + '--' + str(round(item[0], 2)) if include_score else item[1]
            for item in top_key_events
        ]
    return key_years_to_events
def getSkyline(self, buildings):
    """Compute the skyline silhouette of a set of buildings.

    :type buildings: List[List[int]]  # each building is [left, right, height]
    :rtype: List[List[int]]  # key points [x, height] of the skyline
    """
    if not buildings:
        # Contract says rtype is a list, so return [] rather than None.
        return []
    # Build sweep events: tag 0 = building starts, tag 1 = building ends.
    data = [[build[0], build[2], 0] for build in buildings]
    data += [[build[1], build[2], 1] for build in buildings]
    # Sort by x, then by height to break ties deterministically.
    data = sorted(data, key=lambda x: (x[0], x[1]))
    heap = MaxHeap()
    result = []
    for x, h, tag in data:
        # Heights are stored negated, so -heap.top() is the tracked height.
        pre_height = -heap.top() if heap.size() > 0 else 0
        if tag == 0:
            heap.add(-h)
        else:
            # NOTE(review): this pops the heap's top rather than removing
            # the specific height h; correctness relies on this MaxHeap's
            # semantics — confirm against the MaxHeap implementation.
            heap.pop()
        cur_height = -heap.top() if heap.size() > 0 else 0
        if tag == 0 and cur_height > pre_height:
            result.append([x, h])
        elif tag == 1 and h > cur_height:
            result.append([x, cur_height])
    return result
def find_key_events_by_word(self, word, max_events_per_year, years, include_score=False):
    """For each year, find the events whose embedding is most similar to
    `word` (above `self.knn_threshold`), best first.

    Returns an OrderedDict year -> list of events (optionally suffixed with
    '--<score>'), or None when `word` is empty.
    """
    if not word:
        return None
    word = word.lower()
    key_years_to_events = OrderedDict((year, []) for year in years)
    for key_year in years:
        model = self.get_model(key_year)
        # bounded heap keeps only the strongest matches for this year
        best = MaxHeap(max_events_per_year)
        for event in self.get_relevant_events(key_year):
            if word not in self.event_to_content[event]:
                continue
            if not model.contains_all_words([event, word]):
                continue
            sim = model.similarity(event, word)
            if sim > self.knn_threshold:
                best.add(sim, event)
        ranked = sorted(best.heap, reverse=True)
        if include_score:
            key_years_to_events[key_year] = [
                ev + '--' + str(round(sc, 2)) for sc, ev in ranked
            ]
        else:
            key_years_to_events[key_year] = [ev for sc, ev in ranked]
    return key_years_to_events
def find_key_events_by_classifier(self, word, min_classifier_score, max_events_per_year, existing_key_years_to_events, include_score=False):
    """Find important events using the events classifier, blended with a
    previously computed similarity score as a filter.

    'existing_key_years_to_events' should come from another
    'find_key_events_...' method, preferably with a bigger max_events_num,
    so this does more than merely filter an existing result.

    :param word: query word
    :param min_classifier_score: events with a blended score <= this are dropped
    :param max_events_per_year: cap on events kept per year
    :param existing_key_years_to_events: year -> [(event, score), ...]
    :param include_score: if True, append '--<score>' to each event string
    :return: OrderedDict year -> list of events, or None if `word` is empty
    """
    if not word:
        return None
    word = word.lower()
    # NOTE(review): `all_years` is a name from the enclosing module, not a
    # parameter — confirm it is in scope where this method lives.
    key_years_to_events = OrderedDict([(year, []) for year in all_years])
    for key_year, top_events_scores in existing_key_years_to_events.items():
        if not top_events_scores:
            continue
        # featurize each candidate event for the classifier
        event_to_features = {}
        event_to_prev_method_score = {}
        for event, score in top_events_scores:
            event_to_prev_method_score[event] = float(score)
            feature_vector, feature_names = self.classifier.featurize_event_word(
                (event, word))
            if feature_vector is not None:
                event_to_features[event] = feature_vector
        # Guard: predict_proba raises on an empty feature matrix.
        if not event_to_features:
            continue
        probs = list(
            self.classifier.classifier.classifier.predict_proba(
                list(event_to_features.values())))
        # probabilities for the true (positive) class
        y_prob = np.array(probs)[:, 1]
        top_key_events = MaxHeap(max_events_per_year)
        # dict insertion order matches the order features were collected,
        # so y_prob[event_i] lines up with the event.
        for event_i, event in enumerate(list(event_to_features.keys())):
            # weighted blend: 40% classifier probability, 60% previous score
            event_score = (y_prob[event_i] * 4 +
                           event_to_prev_method_score[event] * 6) / 10
            top_key_events.add(event_score, event)
        top_key_events = sorted(top_key_events.heap, reverse=True)
        key_years_to_events[key_year] = [
            item[1] + '--' + str(round(item[0], 2)) if include_score else item[1]
            for item in top_key_events if item[0] > min_classifier_score
        ]
    return key_years_to_events
class PriorityQueue(QueueBase):
    """Queue whose front element is always the one with maximum priority,
    implemented by delegating every operation to a MaxHeap."""

    def __init__(self):
        self._max_heap = MaxHeap()

    def enqueue(self, e):
        """Insert element `e` into the queue."""
        self._max_heap.add(e)

    def dequeue(self):
        """Remove and return the maximum-priority element."""
        return self._max_heap.extract_max()

    def get_front(self):
        """Peek at the maximum-priority element without removing it."""
        return self._max_heap.find_max()

    def get_size(self):
        """Number of elements currently queued."""
        return self._max_heap.size()

    def is_empty(self):
        """True when the queue holds no elements."""
        return self._max_heap.is_empty()
def find_key_events_by_knn(self, word, max_events_per_year, years, include_score=False):
    """Find events closest to the given word *and its nearest neighbors*:
    an event's score is the mean similarity to the word plus its per-year
    KNN expansion.

    :param word: query word
    :param max_events_per_year: cap on events kept per year
    :param years: iterable of years to scan
    :param include_score: if True, items become (event, score-string) tuples
    :return: OrderedDict year -> list of events, or None if `word` is empty
    """
    if not word:
        return None
    word = word.lower()
    year_to_similar_words = self.get_similar_words_per_year(word)
    key_years_to_events = OrderedDict([(year, []) for year in years])
    for key_year in years:
        model = self.get_model(key_year)
        # find the key events from that year
        top_key_events = MaxHeap(max_events_per_year)
        # the query word plus its year-specific nearest neighbors
        similar_words = year_to_similar_words[key_year]
        word_knn = [word] + similar_words if similar_words is not None else [word]
        for e in self.get_relevant_events(key_year):
            # Hoisted out of the comprehension: this membership test does
            # not depend on sim_word, so check it once per event.
            if word not in self.event_to_content[e]:
                continue
            knn_similarities = [
                model.similarity(e, sim_word)
                for sim_word in word_knn
                if model.contains_all_words([e, sim_word])
            ]
            if len(knn_similarities) > 0:
                similarity = np.mean(knn_similarities)
                if similarity > self.knn_threshold:
                    top_key_events.add(similarity, e)
        top_key_events = sorted(top_key_events.heap, reverse=True)
        # NOTE(review): with include_score this yields (event, score) tuples,
        # unlike the sibling methods' 'event--score' strings — confirm callers
        # expect the tuple form before unifying.
        key_years_to_events[key_year] = [
            (item[1], str(round(item[0], 2))) if include_score else item[1]
            for item in top_key_events
        ]
    return key_years_to_events
def knn_search(self, Xi):
    """Search the KD-tree for the k nearest neighbors of sample Xi.

    Returns the MaxHeap holding up to self.k_neighbors nodes, ordered by
    their .dist to Xi (largest distance on top).
    """
    tree = self.tree
    # Bounded max-heap keyed on each node's distance to Xi; the top is the
    # farthest of the current candidates.
    heap = MaxHeap(self.k_neighbors, lambda x: x.dist)
    # Walk from the root down to the leaf that would contain Xi.
    nd = tree.search(Xi, tree.root)
    # Queue of (subtree root, leaf reached under it) pairs to backtrack.
    que = [(tree.root, nd)]
    while que:
        # Compute the distance between Xi and the subtree root.
        nd_root, nd_cur = que.pop(0)
        nd_root.dist = tree.get_eu_dist(Xi, nd_root)
        heap.add(nd_root)
        # Backtrack from the leaf up toward the subtree root.
        while nd_cur is not nd_root:
            # Distance between Xi and the current node.
            nd_cur.dist = tree.get_eu_dist(Xi, nd_cur)
            # Offer the node to the heap of best candidates.
            heap.add(nd_cur)
            # Explore the sibling subtree only if it could hold a closer
            # neighbor: the splitting hyperplane is nearer than the current
            # worst candidate.
            # NOTE(review): `not heap` presumably means "heap not yet full"
            # rather than "empty" — confirm MaxHeap.__bool__ semantics.
            if nd_cur.brother and (not heap or heap.items[0].dist >
                                   tree.get_hyper_plane_dist(Xi, nd_cur.father)):
                _nd = tree.search(Xi, nd_cur.brother)
                que.append((nd_cur.brother, _nd))
            nd_cur = nd_cur.father
    return heap