def getContextWordsLargestVariance(self, contextWordTotalDict):
    heap = []
    cwList = []

    nrQueryWords = len(self.queryWords)
    smallestVariance = float('inf')

    for contextWord, queryDict in contextWordTotalDict.items():
      l = []
      for queryWord in self.queryWords:
        l.append(queryDict[queryWord])

      total = sum(l)
      avg = total / float( nrQueryWords)
      variance = utilities.variance(avg, l, nrQueryWords)
      #print variance

      if len(heap) < self.maxContextWords:
        heapq.heappush(heap, (variance, contextWord))
        if variance < smallestVariance :
          smallestVariance = variance

      else :
        if variance > smallestVariance:
          heapq.heappushpop(heap, (variance, contextWord))
          smallestVariance = heap[0][0]

    while heap:
      cw = heapq.heappop(heap) # order: small to large
      cwList.append( cw[1] )

    return cwList
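Every snippet on this page follows the same bounded-heap pattern: push freely until the heap holds N items, then use heappushpop so the weakest item is evicted whenever a stronger one arrives. A minimal standalone sketch of that pattern (the function and names below are illustrative, not taken from any example on this page):

import heapq

def top_n(iterable, n, key=lambda x: x):
    """Return the n largest items of iterable, largest first."""
    heap = []  # min-heap of (key, item); the root is the weakest survivor
    for item in iterable:
        entry = (key(item), item)
        if len(heap) < n:
            heapq.heappush(heap, entry)
        else:
            # Push the new entry and pop the smallest of the n+1 candidates
            # in a single O(log n) operation.
            heapq.heappushpop(heap, entry)
    return [item for _, item in sorted(heap, key=lambda e: e[0], reverse=True)]

# e.g. top_n([5, 1, 9, 3, 7], 3) == [9, 7, 5]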
def best_n_words(n):
    """
    This method finds the top N words which
    the classifier mostly relies on.
    :param n: number of words returned
    :return: a list of indices of the top N words which
    the classifier mostly relies on.
    """
    effectiveness = word_efficiency_measure()
    heap = []
    vocabulary = np.loadtxt("data/vocabulary.txt", dtype=str)
    # Push the first n elements onto the heap.
    for i in range(n):
        heapq.heappush(heap, (effectiveness[i], vocabulary[i], i))
    # Add the rest of elements to the heap but
    # remove the smallest before that.
    for i in np.arange(n, len(effectiveness)):
        heapq.heappushpop(heap, (effectiveness[i], vocabulary[i], i))
    top_words = []
    top_words_count = np.zeros([n, Data.label_number])
    for i in range(n):
        heap_tuple = heapq.heappop(heap)
        top_words.insert(0, heap_tuple[1])
        # record the occurrences of the word in each category of texts
        top_words_count[n - 1 - i, :] = Data.word_count_conditioned[:, heap_tuple[2]]
    np.savetxt("TopWords", top_words, "%s")
    np.savetxt("TopWordsCountsInDifferentCategories", top_words_count, "%d")
Example #3
 def _find_candidates_window_n(self, word, k=1, prop_threshold=1e-6):
     threshold = log(prop_threshold)
     word = '⟬{}⟭'.format(word.lower().replace('ё', 'е'))
     word_len = len(word) + 1
     inf = float('-inf')
     d = defaultdict(list)
     d[''] = [0.] + [inf] * (word_len - 1)
     prefixes_heap = [(0, self.dictionary.words_trie[''])]
     candidates = [(inf, '')] * k
     while prefixes_heap and -prefixes_heap[0][0] > candidates[0][0]:
         _, prefixes = heappop(prefixes_heap)
         for prefix in prefixes:
             prefix_len = len(prefix)
             d[prefix] = res = [inf]
             for i in range(1, word_len):
                 c_res = [inf]
                 for li in range(1, min(prefix_len + 1, self.window + 2)):
                     for ri in range(1, min(i + 1, self.window + 2)):
                         prev = d[prefix[:-li]][i - ri]
                         if prev > threshold:
                             edit = (prefix[-li:], word[i - ri:i])
                             if edit in self.costs:
                                 c_res.append(prev +
                                              self.costs[edit])
                 res.append(max(c_res))
             if prefix in self.dictionary.words_set:
                 heappushpop(candidates, (res[-1], prefix))
             potential = max(res)
             # potential = max(
             #     [e for i in range(self.window + 2) for e in d[prefix[:prefix_len - i]]])
             if potential > threshold:
                 heappush(prefixes_heap, (-potential, self.dictionary.words_trie[prefix]))
     return [(w.strip('⟬⟭'), score) for score, w in sorted(candidates, reverse=True) if
             score > threshold]
 def add(self, item):
     if item is None:
         return
     if len(self._heap) < self._n:
         heapq.heappush(self._heap, item)
     else:
         heapq.heappushpop(self._heap, item)
def tr_stage1(readlines, min_len, bestn, rid_to_ctg, rid_to_phase):
    """
    For each read in the b-read column of the LAS files, keep the top
    `bestn` hits (by overlap length) in a bounded priority queue over all overlaps.
    """

    rtn = {}
    for l in readlines():
        l = l.strip().split()
        q_id, t_id = l[:2]
        overlap_len = -int(l[2])
        idt = float(l[3])
        q_s, q_e, q_l = int(l[5]), int(l[6]), int(l[7])
        t_s, t_e, t_l = int(l[9]), int(l[10]), int(l[11])
        if t_l < min_len:
            continue
        if q_id not in rid_to_ctg:
            continue

        t_phase = rid_to_phase[ int(t_id) ]
        if t_phase is not None:
            ctg_id, block, phase = t_phase
            if block != -1:
                q_phase = rid_to_phase[ int(q_id) ]
                if q_phase is not None:
                    if q_phase[0] == ctg_id and q_phase[1] == block and q_phase[2] != phase:
                        continue

        rtn.setdefault(t_id, [])
        if len(rtn[t_id]) < bestn:
            heappush(rtn[t_id], (overlap_len, q_id) )
        else:
            heappushpop(rtn[t_id], (overlap_len, q_id) )

    return rtn
Example #6
 def add_point(self, new_point):
     new_pair = OrderedPoint(new_point,
                             self.distance(new_point, self.point))
     if len(self.near) == self.k:
         heapq.heappushpop(self.near, new_pair)
     else:
         heapq.heappush(self.near, new_pair)
Example #7
 def _find_candidates_window_0(self, word, k=1, prop_threshold=1e-6):
     threshold = log(prop_threshold)
     d = {}
     prefixes_heap = [(0, {''})]
     candidates = [(float('-inf'), '') for _ in range(k)]
     word = '⟬{}⟭'.format(word.lower().replace('ё', 'е'))
     word_len = len(word) + 1
     while prefixes_heap and -prefixes_heap[0][0] > candidates[0][0]:
         _, prefixes = heappop(prefixes_heap)
         for prefix in prefixes:
             res = []
             for i in range(word_len):
                 c = word[i - 1:i]
                 res.append(max(
                     (res[-1] + self.costs[('', c)]) if i else float('-inf'),
                     d[prefix[:-1]][i] + self.costs[(prefix[-1], '')] if prefix else float(
                         '-inf'),
                     (d[prefix[:-1]][i - 1] + (self.costs[(prefix[-1], c)]))
                     if prefix and i else float('-inf')
                 ) if i or prefix else 0)
             d[prefix] = res
             if prefix in self.dictionary.words_set:
                 heappushpop(candidates, (res[-1], prefix))
             potential = max(res)
             if potential > threshold:
                 heappush(prefixes_heap, (-potential, self.dictionary.words_trie[prefix]))
     return [(w.strip('⟬⟭'), score) for score, w in sorted(candidates, reverse=True) if
             score > threshold]
	def update_from(self, contents, filename = "", verbose = False):
		#
		# Add the live times from this file to the totals.
		#

		if verbose:
			print("measuring live time ...", file=sys.stderr)
		zero_lag_time_slides, background_time_slides = SnglBurstUtils.get_time_slides(contents.connection)
		self.zero_lag_live_time += SnglBurstUtils.time_slides_livetime(contents.seglists, zero_lag_time_slides.values(), verbose = verbose)
		self.background_live_time += SnglBurstUtils.time_slides_livetime(contents.seglists, background_time_slides.values(), verbose = verbose)

		#
		# Iterate over burst<-->burst coincidences.  Assume there
		# are no injections in this file.
		#

		if verbose:
			print("retrieving sngl_burst<-->sngl_burst coincs ...", file=sys.stderr)
		for id, likelihood, confidence, is_background in bb_id_likelihood_confidence_background(contents):
			record = coinc_detection_statistic(likelihood, confidence)
			if is_background:
				# remember the total number, but only keep
				# the highest ranked
				self.n_background_amplitudes += 1
				if len(self.background_amplitudes) < 1e7:
					heapq.heappush(self.background_amplitudes, record)
				else:
					heapq.heappushpop(self.background_amplitudes, record)
			else:
				self.zero_lag_amplitudes.append((record, filename, id))
		if verbose:
			print("done", file=sys.stderr)
Example #9
 def push(self, x):
     """Pushes a new element."""
     assert self._data is not None
     if len(self._data) < self._n:
         heapq.heappush(self._data, x)
     else:
         heapq.heappushpop(self._data, x)
Example #10
def rank_documents(query):
    result = []
    num_results = 10

    query_list = query.strip().split()
    query_list = list(map(process_word, query_list))

    scores = {}

    for word in query_list: 

        # Process only if word exists
        if word_in_dict(word):
            weight_tq = log_term_freq(query_list.count(word)) * inv_doc_freq(word)
            postings_list = get_postings_list(word)

            for docID, tf in postings_list:
                if docID not in scores:
                    scores[docID] = calc_weight_td(docID, tf) * weight_tq
                else:
                    scores[docID] += calc_weight_td(docID, tf) * weight_tq

    res = []
    for doc in scores:
        if scores[doc] > 0:
            if (len(res) < num_results):        #if the heap is not full, keep pushing
                heapq.heappush(res, (float(scores[doc])/float(doc_len[doc]),int(doc)))
            else:                               #if it's full, then push and pop to maintain the size of the heap
                heapq.heappushpop(res, (float(scores[doc])/float(doc_len[doc]),int(doc)))

    return sorted(res, reverse=True)
Example #11
def nsmallest(n, a):
    # Return the n-th smallest element of a, using a bounded max-heap of size n.
    h = []
    for e in a[:n]:
        heapq.heappush(h, -e)  # we use a max heap but push -1 * e
    for e in a[n:]:
        heapq.heappushpop(h, -e)
    return -heapq.heappop(h)
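A quick sanity check of `nsmallest` above (it assumes `import heapq`): because only negated values are stored, the heap root is the largest of the n smallest values seen so far, so the final pop yields the n-th smallest element.

assert nsmallest(3, [5, 1, 4, 2, 3]) == 3   # 3rd smallest of [1, 2, 3, 4, 5]
assert nsmallest(1, [7, 7, 2]) == 2         # plain minimum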
    def find_for_user(self, user_plays):
        if not user_plays['value']:
            return []

        scores = []

        song_ids = user_plays['value']
        lsh_bucket = []

        for i in range(1, len(song_ids)):
            if len(lsh_bucket) > MAX_SIZE_OF_BUCKET:
                break

            lsh_bucket_key = song_ids[i]
            if lsh_bucket_key in self.buckets:
                candidate_bucket = self.buckets[lsh_bucket_key]
                if len(candidate_bucket) <= MAX_SIZE_OF_BUCKET - len(lsh_bucket):
                    lsh_bucket += list(self.buckets[lsh_bucket_key])

        # for bucket in self.buckets.itervalues():
        #     if len(lsh_bucket) > HALF_OF_MAX_SIZE_OF_BUCKET:
        #         break
        #
        #     lsh_bucket += list(bucket)

        for other_user_plays in lsh_bucket:
            jaccard_index = self.compute_jaccard_index(set(user_plays['value']), set(other_user_plays['value']))
            if jaccard_index >= MIN_SIMILARITY:
                value = (jaccard_index, {'user_id': other_user_plays['_id'], 'songs': other_user_plays['value']})
                if len(scores) < self.k:
                    heappush(scores, value)
                else:
                    heappushpop(scores, value)

        return nlargest(self.k, scores)
Example #13
    def medianII(self, nums):
        if not nums:
            return []

        import heapq
        max_heap = [-nums[0]]
        min_heap = []
        heapq.heapify(max_heap)
        heapq.heapify(min_heap)

        r = [nums[0]]
        for num in nums[1:]:
            len_max_heap = len(max_heap)
            len_min_heap = len(min_heap)

            if 0 < len_max_heap - len_min_heap <= 1:
                max_top = - max_heap[0]
                if num >= max_top:
                    heapq.heappush(min_heap, num)
                else:
                    heapq.heappush(min_heap, max_top)
                    heapq.heappushpop(max_heap, -num)
            else:
                min_top = min_heap[0]
                if num > min_top:
                    heapq.heappushpop(min_heap, num)
                    heapq.heappush(max_heap, -min_top)
                else:
                    heapq.heappush(max_heap, -num)
            r.append(-max_heap[0])
        return r
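A hand-worked trace of `medianII` (it reports the lower median after each element; the enclosing class name `Solution` below is an assumption):

# Solution().medianII([4, 5, 1, 3, 2]) == [4, 4, 4, 3, 3]
#   prefix [4]             -> 4
#   prefix [4, 5]          -> 4   (lower of 4 and 5)
#   prefix [4, 5, 1]       -> 4
#   prefix [4, 5, 1, 3]    -> 3   (lower of 3 and 4)
#   prefix [4, 5, 1, 3, 2] -> 3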
    def kSmallestPairs(self, nums1, nums2, k):
        """
        :type nums1: List[int]
        :type nums2: List[int]
        :type k: int
        :rtype: List[List[int]]
        """
        import heapq
        hq = []
        len_nums1 = len(nums1)
        len_nums2 = len(nums2)
        if k > len_nums1 * len_nums2:
            k = len_nums1 * len_nums2
        count = 0
        nums2_start = nums2[0]
        def sort_return(arr):
            ans = []
            while arr:
                ans.append(heapq.heappop(arr)[1])
            return ans[::-1]

        for i in nums1:
            for j in nums2:
                if hq and count >= k and -hq[0][0] < i + nums2_start:
                    return sort_return(hq)[:k]
                else:
                    # print(hq)  # debug
                    if count < k:
                        heapq.heappush(hq, (-(i + j), [i, j]))
                    else:
                        heapq.heappushpop(hq, (-(i + j), [i, j]))
                count += 1
        return sort_return(hq)[:k]
Example #15
    def top_item(self, u, k):
        heap_n = []  # a priority queue for index nodes
        heap_r = []  # a priority queue for candidate results

        n = self._n_root
        ubound = upper_bound(n._lbound, n._rbound, u)
        heapq.heappush(heap_n, (-1 * ubound, n))

        while (len(heap_n) > 0):
            v_n, n = heapq.heappop(heap_n)
            if not n._children:  # leaf node
                for tid in n._tids:
                    value = self._Q[tid].dot(u)
                    if (len(heap_r) < k):
                        heapq.heappush(heap_r, (value, tid))
                    else:
                        if (value > heap_r[0][0]):
                            heapq.heappushpop(heap_r, (value, tid))
                if (len(heap_r) >= k and heap_r[0][0] >= -1 * v_n):
                    break
            else:  # non-leaf node
                for child in n._children:
                    ubound = upper_bound(child._lbound, child._rbound, u)
                    heapq.heappush(heap_n, (-1 * ubound, child))

        res = []
        for r in heap_r:
            res.append(r[1])
        return res
Example #16
    def top_urls_tie(self, array, n):
        # Build a hash to count the number of times a URL has been visited:
        url_count = {}
        for url in array:
            if url not in url_count.keys():
                url_count[url] = 0
            url_count[url] += 1

        # Now find the top frequencies, and build a hash mapping each frequency to the URLs that share it:
        count_url = {}
        top_frequency = []

        for url in url_count.keys():
            freq = url_count[url]
            if freq not in count_url.keys():
                count_url[freq] = []
            count_url[freq].append(url)
            if len(top_frequency) < n:
                heapq.heappush(top_frequency,freq)
            else:
                heapq.heappushpop(top_frequency,freq)

        result = []
        for freq in top_frequency:
            result.extend(count_url[freq])
            
        return result
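A small usage sketch for `top_urls_tie` (hedged: the owning class is not shown, so `UrlStats` below is just a placeholder, and the order of the returned URLs is not guaranteed):

# visits = ["a", "b", "a", "c", "a", "b"]     # a visited 3x, b 2x, c 1x
# UrlStats().top_urls_tie(visits, 2)          # -> the two most-visited URLs, e.g. ["b", "a"]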
 def mapper_rs(self,_,line):
     r = random.randrange(1000000)
     if len(self.pq) < self.n:
         heapq.heappush(self.pq,(r,line))
     else:
         if self.pq[0][0] < r:
             heapq.heappushpop(self.pq,(r,line))
def update_result(in_distances):
    global DISTANCES_MIN
    for dist in in_distances:
        if len(DISTANCES_MIN) == MAX_RESULT_SIZE:
            heapq.heappushpop(DISTANCES_MIN, dist)
        else:
            heapq.heappush(DISTANCES_MIN, dist)
def find_closest_points_sorted(sample, records, excluding, k, distance):
    """
    Find the k closest records and return them sorted from closest to furthest;
    time complexity: O(len(records)*log(k) + k*log(k))

    A heap of size k is used to store the best matches while iterating through the data.

    sample - the record whose neighbours we are looking for
    records - a generator over records
    excluding - a list of records not to consider
    k - number of closest points to find
    distance - a function distance(record1, record2) -> distance between them
    """
    import heapq
    best_matches = []
    elems_cnt = 0
    
    for r in records:
        if not excluding or r not in excluding:
            curr_err = distance(r, sample)
            if elems_cnt < k:
                heapq.heappush(best_matches, (-curr_err, r))
                elems_cnt += 1
                
            elif -best_matches[0][0] > curr_err:
                #remove the biggest error and add the new element
                heapq.heappushpop(best_matches, (-curr_err, r))
    
    best_matches = sorted(best_matches, key=lambda x: x[0], reverse=True)
    #print '[find_closest_points_sorted] best_matches:', best_matches
    return [x[1] for x in best_matches]
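A small usage sketch under assumed inputs (plain numbers as records, absolute difference as the distance function):

# closest = find_closest_points_sorted(5, iter([1, 4, 7, 9]), excluding=[], k=2,
#                                      distance=lambda a, b: abs(a - b))
# closest == [4, 7]   # sorted from closest to furthest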
Example #20
def find_top_k_with_pytrie(k = 10):
    """
    Too slow for inserts.
        
    memory consuming: 730 MB
    time consuming:  61.0309998989
    (165, 'ts')
    (165, 'um')
    (165, 'yk')
    (165, 'zl')
    (166, 'ky')
    (166, 'sg')
    (167, 'nh')
    (169, 'kp')
    (171, 'eo')
    (172, 'dq')
    """
    result = []
    t = pytrie.SortedStringTrie()
    with open(TDATA) as f:
        for line in f:
            key = line.strip()
            t[key] = t.setdefault(key, 0) + 1

    # heapq
    for e in t.items():
        if len(result) < k:
            heapq.heappush(result, e[::-1])
        else:
            heapq.heappushpop(result, e[::-1])
            
    return result
Example #21
def find_top_k_with_rbtree(filename = TDATA, k = 10):
    """
    Profile result:
       
       5 million strings:
       memory consuming: 259 MB
       time consuming: 92.625
       [(753, 'bf'),
        (753, 'qj'),
        (753, 'zb'),
        (753, 'vz'),
        (763, 'ma'),
        (755, 'lx'),
        (779, 'qp'),
        (768, 'bg'),
        (758, 'eq'),
        (767, 'tf')]
    """
    result = []
    t = rbtree.rbtree()
    with open(filename) as f:
        for line in f:
            key = line.strip()
            t[key] = t.setdefault(key, 0) + 1

    # heapq
    for key, val in t.items():
        if len(result) < k:
            heapq.heappush(result, (val, key))
        else:
            heapq.heappushpop(result, (val, key))
    
    return result
Example #22
def find_top_k_with_trie(k = 10):
    """
    Too slow and too memory-hungry.
    
    time consuming:  147.656000137
    (164, 'mh')
    (164, 'sq')
    (165, 'bi')
    (165, 'mo')
    (167, 'im')
    (168, 'ux')
    (169, 'br')
    (169, 'gj')
    (170, 'ij')
    (171, 'qd')
    """
    result = []
    t = Trie()
    # trie
    with open(TDATA) as f:
        for line in f:
            t.insert(line.strip())
    
    # heapq
    for n in t.ipreorder(t.root):
        if len(result) < k:
            heapq.heappush(result, n)
        else:
            heapq.heappushpop(result, n)
            
    return result
 def maximumProduct(self, nums):
     """
     :type nums: List[int]
     :rtype: int
     """
     import heapq
     k = 3
     min_n = []
     max_n = []
     heapq.heapify(min_n)
     heapq.heapify(max_n)
     
     for n in nums:
         if n < 0:
             if len(min_n) < k:
                 heapq.heappush(min_n, -n)
             elif min_n[0] < -n:
                 heapq.heappushpop(min_n, -n)
                 
         if len(max_n) < k:
             heapq.heappush(max_n, n)
         elif max_n[0] < n:
             heapq.heappushpop(max_n, n)
                 
     min_n = sorted([-n for n in min_n])
     max_n = sorted(max_n)
     
      print(max_n)
      print(min_n)
     
     max_v = max_n[0] * max_n[1] * max_n[2]
     if len(min_n) < 2:
         return max_v
     else:
         return max(max_v, min_n[0] * min_n[1] * max_n[-1])
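A worked example for `maximumProduct` (hedged: assuming it sits on a LeetCode-style `Solution` class); the answer is the larger of the product of the three largest values and the product of the two most negative values with the largest value:

# Solution().maximumProduct([-5, -4, 1, 2, 3])
#   max_n -> [1, 2, 3], min_n -> [-5, -4]
#   max(1 * 2 * 3, (-5) * (-4) * 3) == 60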
Example #24
def fuzzy_set_search(db, key, search_string, size=100):
    """Return the most similar set of values to the search string.

    Parameters
    ----------
    db: databroker.DataBroker instance
        The databroker to be searched
    key: list of str
        The list of strings to be accessed
    search_string: str
        The string to be searched for
    size: int, optional
        The number of results to be returned.
         Defaults to 100 results

    Returns
    -------
    list:
        A list

    Examples
    --------
    >>> db = Broker(...) # Contains runs from Bob, Alice, Bob, and Eve
    >>> fuzzy_set_search(db, 'bt_piLast', 'Bob')
    ['Bob', 'Alice', 'Eve']
    """
    heap = [(-1, -1)] * size  # ndld can't return less than 0
    heapify(heap)
    values = set([h['start'][key] for h in db()])
    for v in values:
        heappushpop(heap, (1. - ndld(v, search_string), v))
    heap.sort()
    heap.reverse()
    return [g[-1] for g in heap if g[0] >= 0.]
Example #25
def main():

	if len(sys.argv) != 2:
		sys.stderr.write(USAGE)
		sys.exit(1)

	k = int(sys.argv[1])
	data = read_mapper_output(sys.stdin)

	for qno, group in groupby(data, itemgetter(0)):
		# create heap
		scores = []
		heap_size = 0
		for qno, prefix, docno, rank, score, runid in group:
			if heap_size < k: 
				h.heappush( scores, (score, docno) )
				heap_size += 1
			elif heap_size == k:
				if score > scores[0][0]:
					h.heappushpop(scores, (score, docno) )
				# else: throw away
			else:
				sys.stderr.write("This should not happen!")
				sys.exit(1)
		
		bestK = h.nlargest(k, scores)
		for rank, x in enumerate(bestK, 1):
			score, docno = x
			sys.stdout.write("%s\t%s\t%s\t%d\t%.7f\t%s\n"%( qno, prefix, docno, rank, score, runid ) )
Example #26
 def test_pushpop_time(self):
     with print_time('heapq.heappushpop'):
         for i in range(repeat):
             heappushpop(self.hqs[i], 10000000)
     with print_time('Heap.pushpop'):
         for i in range(repeat):
             self.hs[i].pushpop(10000000)
Example #27
def super_fuzzy_search(db, search_string, size=100):
    """Fuzzy search a databroker

    Parameters
    ----------
    db: databroker.DataBroker instance
        The databroker to be searched
    search_string: str
        The string to be searched for
    size: int, optional
        The number of results to be returned.
         Defaults to 100 results

    Returns
    -------
    list:
        A list

    """
    heap = [(-1, -1, -1)] * size  # ndld can't return less than 0
    heapify(heap)
    for h in db():
        internal_scores = [1. - ndld(v, search_string) for v in
                           _nested_dict_values(h['start']) if v is not None]
        heappushpop(heap, (max(internal_scores), h['start']['time'] * -1, h))
    heap.sort()
    heap.reverse()
    return [g[-1] for g in heap if g[0] != -1]
Example #28
def get_top_k(m, k=1, diagonal_ignore = True):
	""" given a matrix iterates on its elements and maintains top k in a min heap
		diagonal_ignore => flag to indicate whether to ignore diagnoal elements or not
						   and search half the matrix assuming it is symmetric

		return top k values with indices of element in matrix
		TODO: think about sketching to get top k heavy hitters in this sparse matrix
	"""
	rows = m.shape[0]
	cols = m.shape[1]
	i, j  = 0, 0
	h = []
	for i in range(0, rows):
		for j in range(0, cols):
			#ignore diagonal elements
			if diagonal_ignore and i <= j:
				continue
			else:
				elem = m[i,j]
				tup = (i, j)    # to track indexes of highest elements
				if len(h) < k:
					hq.heappush(h, (elem, tup))
				else:
					#print h, elem, i, j
					if h[0][0] < elem:
						hq.heappushpop(h, (elem, tup))
	return [hq.heappop(h) for i in range(0, len(h))]
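A small usage sketch for `get_top_k`, assuming `hq` is `heapq` imported under that alias and `m` is a symmetric NumPy array; with `diagonal_ignore=True` (the default) only the lower triangle (i > j) is scanned:

# import heapq as hq
# import numpy as np
# m = np.array([[0, 9, 2],
#               [9, 0, 5],
#               [2, 5, 0]])
# get_top_k(m, k=2)   # -> [(5, (2, 1)), (9, (1, 0))], popped smallest-first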
Example #29
    def search_in_nodes(self, the_node, target_point, count, the_heap):
        if the_node is None:
            return

        # the_node.index.p()
        # self.points[the_node.index].p()
        distance = cal_distance(self.points[the_node.index], target_point)

        if(distance < self.tau):
            if(len(the_heap) == count):
                heapq.heappushpop(the_heap, (0-distance, the_node.index))
                self.tau = 0 - the_heap[0][0]
                # the_heap.p()
            else:
                heapq.heappush(the_heap, (0-distance, the_node.index))
                # the_heap.p()

        if(the_node.left == None and the_node.right == None):
            return

        if(distance < the_node.threshold):
            self.search_in_nodes(the_node.left, target_point, count, the_heap)
            if (self.tau + distance >= the_node.threshold):
                self.search_in_nodes(the_node.right, target_point, count, the_heap)
        else:
            self.search_in_nodes(the_node.right, target_point, count, the_heap)
            if (self.tau >= distance - the_node.threshold):
                self.search_in_nodes(the_node.left, target_point, count, the_heap)
Example #30
def find_top_k_with_datrie(k = 10):
    """
    Too slow for inserts.
        
    time consuming:  896.575999975
    (164, u'mh')
    (164, u'sq')
    (165, u'bi')
    (165, u'mo')
    (167, u'im')
    (168, u'ux')
    (169, u'br')
    (169, u'gj')
    (170, u'ij')
    (171, u'qd')
    """
    result = []
    t = datrie.Trie(string.ascii_lowercase)
    with open(TDATA) as f:
        for line in f:
            key = line.strip()
            t[key] = t.setdefault(key, 0) + 1

    # heapq
    state = datrie.State(t)
    state.walk(u'A')
    it = datrie.Iterator(state)
    while it.next():
        if len(result) < k:
            heapq.heappush(result, (it.data(), it.key()))
        else:
            heapq.heappushpop(result, (it.data(), it.key()))
            
    return result
 def addNum(self, num: int) -> None:
     heapq.heappush(self.lo, -heapq.heappushpop(self.hi, num))
     if len(self.hi) < len(self.lo):
         heapq.heappush(self.hi, -heapq.heappop(self.lo))
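A hedged sketch of the enclosing class this `addNum` appears to belong to (the streaming-median pattern: `self.hi` is a min-heap of the larger half, `self.lo` a max-heap of the smaller half stored negated; the class name and `findMedian` are assumptions, not shown in the source):

import heapq

class MedianFinder:  # assumed enclosing class
    def __init__(self):
        self.lo = []  # max-heap (negated values) of the smaller half
        self.hi = []  # min-heap of the larger half; invariant: len(hi) >= len(lo)

    def addNum(self, num: int) -> None:
        heapq.heappush(self.lo, -heapq.heappushpop(self.hi, num))
        if len(self.hi) < len(self.lo):
            heapq.heappush(self.hi, -heapq.heappop(self.lo))

    def findMedian(self) -> float:
        if len(self.hi) > len(self.lo):
            return float(self.hi[0])
        return (self.hi[0] - self.lo[0]) / 2.0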
Example #32
 def add(self, val: int) -> int:
     if len(self.h) < self.k:
         heapq.heappush(self.h, val)
     else:
         heapq.heappushpop(self.h, val)
     return self.h[0]
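This `add` matches the "Kth Largest Element in a Stream" pattern; a hedged sketch of a possible enclosing class plus a usage line (the class name and constructor below are assumptions, not shown in the source):

import heapq

class KthLargest:  # assumed enclosing class
    def __init__(self, k: int, nums: list):
        self.k = k
        self.h = []          # min-heap of the k largest values seen so far
        for n in nums:
            self.add(n)

    def add(self, val: int) -> int:
        if len(self.h) < self.k:
            heapq.heappush(self.h, val)
        else:
            heapq.heappushpop(self.h, val)
        return self.h[0]     # the k-th largest so far

# KthLargest(3, [4, 5, 8, 2]).add(3) == 4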
def __search(sync_net,
             ini,
             fin,
             cost_function,
             skip,
             ret_tuple_as_trans_desc=False,
             max_align_time_trace=sys.maxsize):
    start_time = time.time()

    decorate_transitions_prepostset(sync_net)
    decorate_places_preset_trans(sync_net)

    incidence_matrix = inc_mat_construct(sync_net)
    ini_vec, fin_vec, cost_vec = utils.__vectorize_initial_final_cost(
        incidence_matrix, ini, fin, cost_function)

    closed = set()
    heu_dict = {}
    heu_max_ind_dict = {}
    mtcgt_dict = {}

    parameters = {}
    parameters[marking_equation.Parameters.FULL_BOOTSTRAP_REQUIRED] = False
    parameters[marking_equation.Parameters.INCIDENCE_MATRIX] = incidence_matrix
    parameters[marking_equation.Parameters.COSTS] = cost_function

    visited = 0
    queued = 0
    traversed = 0
    me = marking_equation.build(sync_net, ini, fin, parameters=parameters)
    h, x = me.solve()
    lp_solved = 1

    # try to see if the firing sequence is already fine
    firing_sequence, reach_fm, explained_events = me.get_firing_sequence(x)
    if reach_fm:
        return __reconstruct_alignment(
            firing_sequence,
            h,
            visited,
            queued,
            traversed,
            ret_tuple_as_trans_desc=ret_tuple_as_trans_desc,
            lp_solved=lp_solved)
    mm, index = __get_model_marking_and_index(ini)
    __update_heu_dict(heu_dict, heu_max_ind_dict, mm, index, h, x,
                      firing_sequence, incidence_matrix, cost_vec)

    ini_state = utils.TweakedSearchTuple(0 + h, 0, h, ini, None, None, x, True,
                                         False)
    open_set = [ini_state]
    heapq.heapify(open_set)

    trans_empty_preset = set(t for t in sync_net.transitions
                             if len(t.in_arcs) == 0)

    while not len(open_set) == 0:
        if (time.time() - start_time) > max_align_time_trace:
            return None

        curr = heapq.heappop(open_set)

        current_marking = curr.m

        while not curr.trust:
            if (time.time() - start_time) > max_align_time_trace:
                return None

            already_closed = current_marking in closed
            if already_closed:
                curr = heapq.heappop(open_set)
                current_marking = curr.m
                continue

            if curr.t not in mtcgt_dict:
                lp_solved += 1
                mtcgt = __min_total_cost_given_trans(me, ini, incidence_matrix,
                                                     curr.t)
                mtcgt_dict[curr.t] = mtcgt
            else:
                mtcgt = mtcgt_dict[curr.t]

            h1 = max(mtcgt - curr.g, 0)
            if h1 > curr.h:
                tp = utils.TweakedSearchTuple(curr.g + h1, curr.g, h1, curr.m,
                                              curr.p, curr.t, curr.x, False,
                                              False)
                curr = heapq.heappushpop(open_set, tp)
                current_marking = curr.m
                continue

            mm, index = __get_model_marking_and_index(curr.m)
            h2, x2, trust2 = __get_heu_from_dict(heu_dict, heu_max_ind_dict,
                                                 mm, index)
            if h2 is not None and h2 > curr.h:
                tp = utils.TweakedSearchTuple(curr.g + h2, curr.g, h2, curr.m,
                                              curr.p, curr.t, x2, trust2,
                                              False)
                curr = heapq.heappushpop(open_set, tp)
                current_marking = curr.m
                continue

            me.change_ini_vec(curr.m)
            h, x = me.solve()

            __update_heu_dict_specific_point(heu_dict, heu_max_ind_dict, mm,
                                             index, h, x)

            lp_solved += 1
            tp = utils.TweakedSearchTuple(curr.g + h, curr.g, h, curr.m,
                                          curr.p, curr.t, x, True, True)
            curr = heapq.heappushpop(open_set, tp)
            current_marking = curr.m

        already_closed = current_marking in closed
        if already_closed:
            continue
        if curr.h < 0.01:
            if current_marking == fin:
                trans_list = __transitions_list_from_state(curr)
                return __reconstruct_alignment(
                    trans_list,
                    curr.f,
                    visited,
                    queued,
                    traversed,
                    ret_tuple_as_trans_desc=ret_tuple_as_trans_desc,
                    lp_solved=lp_solved)

        if curr.virgin:
            # try to see if the firing sequence is already fine
            firing_sequence, reach_fm, explained_events = me.get_firing_sequence(
                curr.x)
            if reach_fm:
                trans_list = __transitions_list_from_state(curr) + list(
                    firing_sequence)
                return __reconstruct_alignment(
                    trans_list,
                    curr.f,
                    visited,
                    queued,
                    traversed,
                    ret_tuple_as_trans_desc=ret_tuple_as_trans_desc,
                    lp_solved=lp_solved)
            mm, index = __get_model_marking_and_index(curr.m)
            __update_heu_dict(heu_dict, heu_max_ind_dict, mm, index, h, x,
                              firing_sequence, incidence_matrix, cost_vec)

        closed.add(current_marking)
        visited += 1

        possible_enabling_transitions = copy(trans_empty_preset)
        for p in current_marking:
            for t in p.ass_trans:
                possible_enabling_transitions.add(t)

        enabled_trans = [
            t for t in possible_enabling_transitions
            if t.sub_marking <= current_marking
        ]

        trans_to_visit_with_cost = [
            (t, cost_function[t]) for t in enabled_trans
            if not (t is not None and utils.__is_log_move(t, skip)
                    and utils.__is_model_move(t, skip))
        ]

        for t, cost in trans_to_visit_with_cost:
            traversed += 1
            new_marking = utils.add_markings(current_marking, t.add_marking)

            if new_marking in closed:
                continue
            g = curr.g + cost

            queued += 1
            h, x = utils.__derive_heuristic(incidence_matrix, cost_vec, curr.x,
                                            t, curr.h)
            trust = utils.__trust_solution(x)
            mm, index = __get_model_marking_and_index(new_marking)

            if not trust:
                h2, x2, trust2 = __get_heu_from_dict(heu_dict,
                                                     heu_max_ind_dict, mm,
                                                     index)
                if h2 is not None and (h2 > h or trust2):
                    h = h2
                    x = x2
                    trust = trust2
            else:
                __update_heu_dict_specific_point(heu_dict, heu_max_ind_dict,
                                                 mm, index, h, x)

            new_f = g + h
            tp = utils.TweakedSearchTuple(new_f, g, h, new_marking, curr, t, x,
                                          trust, False)
            heapq.heappush(open_set, tp)
def inference(reader, train_dir, data_pattern, out_file_location, batch_size,
              top_k):
    with tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True)) as sess, gfile.Open(
                out_file_location, "w+") as out_file:
        video_id_batch, video_batch, num_frames_batch = get_input_data_tensors(
            reader, data_pattern, batch_size)
        inference_model_name = ("segment_inference_model"
                                if FLAGS.segment_labels else "inference_model")
        checkpoint_file = os.path.join(FLAGS.train_dir, "inference_model",
                                       inference_model_name)
        if not gfile.Exists(checkpoint_file + ".meta"):
            raise IOError("Cannot find %s. Did you run eval.py?" %
                          checkpoint_file)
        meta_graph_location = checkpoint_file + ".meta"
        logging.info("loading meta-graph: " + meta_graph_location)

        if FLAGS.output_model_tgz:
            with tarfile.open(FLAGS.output_model_tgz, "w:gz") as tar:
                for model_file in glob.glob(checkpoint_file + ".*"):
                    tar.add(model_file, arcname=os.path.basename(model_file))
                tar.add(
                    os.path.join(FLAGS.train_dir, "model_flags.json"),
                    arcname="model_flags.json",
                )
            print("Tarred model onto " + FLAGS.output_model_tgz)
        with tf.device("/cpu:0"):
            saver = tf.train.import_meta_graph(meta_graph_location,
                                               clear_devices=False)

        logging.info("restoring variables from " + checkpoint_file)
        saver.restore(sess, checkpoint_file)
        input_tensor = tf.get_collection("input_batch_raw")[0]
        num_frames_tensor = tf.get_collection("num_frames")[0]
        predictions_tensor = tf.get_collection("predictions")[0]

        # Workaround for num_epochs issue.
        def set_up_init_ops(variables):
            init_op_list = []
            for variable in list(variables):
                if "train_input" in variable.name:
                    init_op_list.append(tf.assign(variable, 1))
                    variables.remove(variable)
            init_op_list.append(tf.variables_initializer(variables))
            return init_op_list

        sess.run(
            set_up_init_ops(tf.get_collection_ref(
                tf.GraphKeys.LOCAL_VARIABLES)))

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        num_examples_processed = 0
        start_time = time.time()
        out_file.write("VideoId,LabelConfidencePairs\n")
        whitelisted_cls_mask = None
        if FLAGS.segment_labels:
            final_out_file = out_file
            out_file = tempfile.NamedTemporaryFile()
            logging.info(
                "Segment temp prediction output will be written to temp file: %s",
                out_file.name,
            )
            if FLAGS.segment_label_ids_file:
                whitelisted_cls_mask = np.zeros(
                    (predictions_tensor.get_shape()[-1], ), dtype=np.float32)
                segment_label_ids_file = FLAGS.segment_label_ids_file
                if segment_label_ids_file.startswith("http"):
                    logging.info(
                        "Retrieving segment ID whitelist files from %s...",
                        segment_label_ids_file,
                    )
                    segment_label_ids_file, _ = urllib.request.urlretrieve(
                        segment_label_ids_file)
                with tf.io.gfile.GFile(segment_label_ids_file) as fobj:
                    for line in fobj:
                        try:
                            cls_id = int(line)
                            whitelisted_cls_mask[cls_id] = 1.0
                        except ValueError:
                            # Simply skip the non-integer line.
                            continue

        out_file.write(u"VideoId,LabelConfidencePairs\n".encode("utf8"))

        try:
            while not coord.should_stop():
                video_id_batch_val, video_batch_val, num_frames_batch_val = sess.run(
                    [video_id_batch, video_batch, num_frames_batch])
                if FLAGS.segment_labels:
                    results = get_segments(video_batch_val,
                                           num_frames_batch_val, 5)
                    video_segment_ids = results["video_segment_ids"]
                    video_id_batch_val = video_id_batch_val[
                        video_segment_ids[:, 0]]
                    video_id_batch_val = np.array([
                        "%s:%d" % (x.decode("utf8"), y) for x, y in zip(
                            video_id_batch_val, video_segment_ids[:, 1])
                    ])
                    video_batch_val = results["video_batch"]
                    num_frames_batch_val = results["num_frames_batch"]
                    if input_tensor.get_shape()[1] != video_batch_val.shape[1]:
                        raise ValueError(
                            "max_frames mismatch. Please re-run the eval.py "
                            "with correct segment_labels settings.")

                predictions_val, = sess.run(
                    [predictions_tensor],
                    feed_dict={
                        input_tensor: video_batch_val,
                        num_frames_tensor: num_frames_batch_val,
                    },
                )
                now = time.time()
                num_examples_processed += len(video_batch_val)
                # num_classes = predictions_val.shape[1]
                elapsed_time = now - start_time
                logging.info("num examples processed: " +
                             str(num_examples_processed) +
                             " elapsed seconds: " +
                             "{0:.2f}".format(elapsed_time) +
                             " examples/sec: %.2f" %
                             (num_examples_processed / elapsed_time))
                for line in format_lines(video_id_batch_val, predictions_val,
                                         top_k, whitelisted_cls_mask):
                    out_file.write(line)
                out_file.flush()

        except tf.errors.OutOfRangeError:
            logging.info(
                "Done with inference. The output file was written to " +
                out_file_location)
        finally:
            coord.request_stop()

            if FLAGS.segment_labels:
                # Re-read the file and do heap sort.
                # Create multiple heaps.
                logging.info("Post-processing segment predictions...")
                heaps = {}
                out_file.seek(0, 0)
                for line in out_file:
                    segment_id, preds = line.decode("utf8").split(",")
                    if segment_id == "VideoId":
                        # Skip the headline.
                        continue
                    preds = preds.split(" ")
                    pred_cls_ids = [
                        int(preds[idx]) for idx in range(0, len(preds), 2)
                    ]
                    pred_cls_scores = [
                        float(preds[idx]) for idx in range(1, len(preds), 2)
                    ]
                    for cls, score in zip(pred_cls_ids, pred_cls_scores):
                        if not whitelisted_cls_mask[cls]:
                            # Skip non-whitelisted classes.
                            continue
                        if cls not in heaps:
                            heaps[cls] = []
                        if len(heaps[cls]) >= FLAGS.segment_max_pred:
                            heapq.heappushpop(heaps[cls], (score, segment_id))
                        else:
                            heapq.heappush(heaps[cls], (score, segment_id))
                logging.info("Writing sorted segment predictions to: %s",
                             final_out_file.name)
                final_out_file.write("Class,Segments\n")
                for cls, cls_heap in heaps.items():
                    cls_heap.sort(key=lambda x: x[0], reverse=True)
                    final_out_file.write(
                        "%d,%s\n" % (cls, " ".join([x[1] for x in cls_heap])))
                final_out_file.close()

            out_file.close()

        coord.join(threads)
        sess.close()
Example #35
def read_lyrics(lyrics_dir='lyrics_en',
                artist=None,
                album=None,
                print_stats=False,
                language='en-us',
                lookback=15):
    '''
    Read lyrics and compute Rhyme factor (riimikerroin) for each
    artist.

    Input:
        lyrics_dir  Path to the directory containing the lyrics.
        artist      Name of the artist directory under lyrics_dir (if this is
                    not provided, all artists are analyzed).
        album       Name of the album directory under lyrics_dir/artist/
        print_stats Whether we print summary statistics for each individual
                    song.
        language    Use either Finnish (fi), American English (en-us), 
                    or English (en).
        lookback    How many previous words are checked for rhymes. For
                    Finnish I've used 10 and for English 15.
    '''

    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    print(os.getcwd())

    if artist is not None:
        artists = [artist]
    else:
        artists = os.listdir(lyrics_dir)
    artist_scores = []
    song_scores = []
    song_names = []
    uniq_words = []
    longest_rhymes = []
    max_rhymes = 5
    for a in artists:
        print("Analyzing artist: %s" % a)
        uwc = []
        rls = []
        all_words = []
        if album is not None:
            albums = [album]
        else:
            albums = os.listdir(lyrics_dir + '/' + a)
            albums = sort_albums_by_year(albums)
        for al in albums:
            album_rls = []
            songs = os.listdir(lyrics_dir + '/' + a + '/' + al)
            # Only the .txt files
            songs = [s for s in songs if len(s) >= 4 and s[-4:] == '.txt']
            for song in songs:
                file_name = lyrics_dir + '/' + a + '/' + al + '/' + song
                l = Lyrics(file_name,
                           print_stats=print_stats,
                           language=language,
                           lookback=lookback)
                rl = l.get_avg_rhyme_length()
                rls.append(rl)
                song_scores.append(rl)
                song_names.append(file_name)
                album_rls.append(rl)
                if len(longest_rhymes) < max_rhymes:
                    heapq.heappush(longest_rhymes, l.get_longest_rhyme())
                else:
                    heapq.heappushpop(longest_rhymes, l.get_longest_rhyme())

                if language == 'fi':
                    all_words += l.text.split()
                else:
                    text = l.text_orig.lower()
                    rx = re.compile(r'[^\wåäö]+')
                    text = rx.sub(' ', text)
                    all_words += text.split()
                    num_uniq_words = len(set(text.split()))
                    print("number of unique words:", num_uniq_words)
                    uwc.append(num_uniq_words)
            # Print stats for the album
            #print "%s - %s: %.3f" % (a, al, np.mean(np.array(album_rls)))
            #print "%.5f" % (np.mean(np.array(album_rls)))

        # Compute the number of unique words the artist has used
        n_words = len(all_words)
        min_w = 20000
        if n_words:
            n_uniq_words = len(set(all_words[:min_w]))
            print("Unique ")
            uniq_words.append(n_uniq_words)
        else:
            uniq_words.append(-n_words)
        mean_rl = np.mean(np.array(rls))
        artist_scores.append(mean_rl)
        print("Overall Unique Word Count: " + a, np.mean(np.array(uwc)))

    # Sort the artists based on their avg rhyme lengths
    artist_scores = np.array(artist_scores)
    artists = np.array(artists)
    uniq_words = np.array(uniq_words)
    order = np.argsort(artist_scores)[::-1]
    artists = artists[order]
    uniq_words = uniq_words[order]
    artist_scores = artist_scores[order]

    print("\nBest rhymes")
    while len(longest_rhymes) > 0:
        l, rhyme = heapq.heappop(longest_rhymes)
        print(rhyme)

    print("\nBest songs:")
    song_scores = np.array(song_scores)
    song_names = np.array(song_names)
    song_names = song_names[np.argsort(song_scores)[::-1]]
    song_scores = sorted(song_scores)[::-1]
    for i in range(min(10, len(song_scores))):
        print('%.3f\t%s' % (song_scores[i], song_names[i]))

    print("\nBest artists:")
    for i in range(len(artist_scores)):
        rx = re.compile(u'_')
        name = rx.sub(' ', artists[i])
        print('%d.\t%.3f\t%s' % (i + 1, artist_scores[i], name))

    print("\nUnique Words:")
    for i in range(len(artist_scores)):
        rx = re.compile(u'_')
        name = rx.sub(' ', artists[i])
        print('%.1f\t%s' % (uniq_words[i], name))
Example #36
 def pushpop(self, item):
     return heapq.heappushpop(self, MaxHeapObj(item)).val
Example #37
 def pushpop(self, item):
     return heapq.heappushpop(self, item)
Example #38
def do_inference_brute_force(ps, L, M, unary_params, pw_params, unary_features, pw_features,
                             y_true=None, y_true_list=None, debug=False, top=5):
    """
    Inference using brute force search (for sanity check), could be:
    - Train/prediction inference for single-label SSVM
    - Train/prediction inference for multi-label SSVM
    """
    assert(L > 1)
    assert(L <= M)
    assert(ps >= 0)
    assert(ps < M)
    assert(top > 0)
    if y_true is not None:
        assert(y_true_list is not None and type(y_true_list) == list)
    if y_true is not None:
        top = 1

    Cu = np.zeros(M, dtype=float)       # unary_param[p] x unary_features[p]
    Cp = np.zeros((M, M), dtype=float)  # pw_param[pi, pj] x pw_features[pi, pj]
    # an intermediate POI should NOT be the start POI, NO self-loops
    for pi in range(M):
        Cu[pi] = np.dot(unary_params[pi, :], unary_features[pi, :])   # if pi != ps else -np.inf
        for pj in range(M):
            Cp[pi, pj] = -np.inf if (pj == ps or pi == pj) else np.dot(pw_params[pi, pj, :], pw_features[pi, pj, :])

    Q = []
    for x in itertools.permutations([p for p in range(M) if p != ps], int(L - 1)):
        y = [ps] + list(x)
        score = 0

        if y_true is not None and np.any([np.all(np.array(y) == np.asarray(yj)) for yj in y_true_list]) is True:
            continue

        for j in range(1, L):
            score += Cp[y[j - 1], y[j]] + Cu[y[j]]
        if y_true is not None:
            score += np.sum(np.asarray(y) != np.asarray(y_true))

        if len(Q) < top:
            hq.heappush(Q, HeapTerm(score, np.array(y)))
        else:
            hq.heappushpop(Q, HeapTerm(score, np.array(y)))  # pop the smallest, then push

    results = []
    scores = []
    while len(Q) > 0:
        hterm = hq.heappop(Q)
        results.append(hterm.task)
        scores.append(hterm.priority)

    # reverse the order: smallest -> largest => largest -> smallest
    results.reverse()
    scores.reverse()

    if debug is True:
        for score, y in zip(scores, results):
            print(score, y)

    if y_true is not None:
        results = results[0]

    return results
Example #39
 def addNum(self, num: int) -> None:
     if len(self.small) == len(self.large):
         heappush(self.large, -heappushpop(self.small, -num))
     else:
         heappush(self.small, -heappushpop(self.large, num))
Example #40
def push_pqueue(queue, priority, value):
    if len(queue) > 20:
        heapq.heappushpop(queue, (priority, value))
    else:
        heapq.heappush(queue, (priority, value))
Example #41
def sim_density_search(choices, rot_ret, queueSize, cur_results_dir):
    most_sim = []  # format: [(1.0 - density, (sx, sy))] for each choice

    size = rot_ret.template.size

    nIterations = 0
    for c in choices:
        cx, cy = c.size()
        nIterations += (cx - size + 1) * (cy - size + 1)

    print(nIterations)
    global_counter = 0
    for c_index, c in enumerate(choices):
        slice_easel_choice_path = os.path.join(cur_results_dir,
                                               'slot%d' % c_index)
        if not os.path.exists(slice_easel_choice_path):
            os.makedirs(slice_easel_choice_path)

        all_sim_choice = []  # format: [(1.0 - density, (sx, sy))]
        most_sim.append([])
        #density_data.append([])
        cx, cy = c.size()
        print('Easel %d: %d total locations' % (c_index, (cx - size + 1) *
                                                (cy - size + 1)))
        #print('Slot %d:\nc.size(): %s\nc.im.size: %s\n' % (c_index, str(c.size()), str(c.im.size)))
        #assert(c.size() == c.im.size)
        counter = 0
        #print cx,cy
        for sx in range(cx - size + 1):
            for sy in range(cy - size + 1):
                density_delta = rot_ret.ringwiseDensityDelta(
                    c.pix[sx:sx + size, sy:sy + size])
                res = 1.0 - density_delta

                all_sim_choice.append((res, (sx, sy)))

                if counter < queueSize:
                    hq.heappush(most_sim[c_index], (res, (sx, sy)))
                else:
                    hq.heappushpop(most_sim[c_index], (res, (sx, sy)))

                if global_counter % 50000 == 0:
                    print(
                        str(
                            round(100 *
                                  (float(global_counter) / nIterations), 2)) +
                        r'% done')

                #print(str(counter) + ': ' + str(most_sim['sp']))

                counter += 1
                global_counter += 1

        all_sim_choice.sort(reverse=True)
        most_sim[c_index] = sorted(most_sim[c_index], reverse=True)
        deltas, locs = tuple(zip(*most_sim[c_index]))

        # need to flip axes to make points
        point_locs = [(y, x) for (x, y) in locs]

        cutoff = 0.5 * (all_sim_choice[0][0] + all_sim_choice[-1][0])
        cutoff_index = queueSize - 1
        while all_sim_choice[cutoff_index][
                0] > cutoff and cutoff_index < counter:
            cutoff_index += 1

        cutoff_index += 1

        with open(os.path.join(slice_easel_choice_path, 'all_deltas.csv'),
                  'wb') as deltas_file:
            wr = csv.writer(deltas_file, delimiter=',')
            wr.writerows(all_sim_choice[:cutoff_index])

        # format: (cutoff, color)
        # cutoffs refer to fraction of distance between worst and best deltas in queue
        heatmap_input_alt = [(0.0, 'violet'), (0.5, 'indigo'), (0.8, 'blue'),
                             (0.95, 'green'), (0.97, 'yellow'),
                             (0.98, 'orange'), (0.99, 'red')]

        heatmap_input = [(0.0, 'violet'), (0.5, 'indigo'), (0.8, 'blue'),
                         (0.95, 'green'), (0.97, 'yellow'), (0.98, 'orange'),
                         (0.99, 'red')]

        cutoffs, colors = tuple(zip(*heatmap_input))

        heatmap_cutoffs = [
            co * deltas[0] + (1 - co) * deltas[-1] for co in cutoffs
        ]
        heatmap = {
            cutoff: color
            for cutoff, color in zip(heatmap_cutoffs, colors)
        }

        points_im = Image.new('RGB', (cy, cx), color='white')
        pointer = ImageDraw.Draw(points_im)

        end_index = 0
        for cutoff in reversed(heatmap_cutoffs[1:]):
            start_index = end_index
            while deltas[end_index] > cutoff:
                end_index += 1

            pointer.point(point_locs[start_index:end_index],
                          fill=heatmap[cutoff])

        final_color = heatmap[heatmap_cutoffs[0]]
        pointer.point(point_locs[end_index:], fill=final_color)

        slot_im = c.im.convert('RGB')
        overlay_im = Image.blend(points_im, slot_im, alpha=0.2)

        overlay_im.save(os.path.join(slice_easel_choice_path,
                                     'sim_deltas.png'))

    return [[tup[1] for tup in choice] for choice in most_sim]
Example #42
def extract_frames(video_file='None',
                   num_frames=100,
                   out_dir=None,
                   show_flow=False,
                   algo='flow',
                   keep_first_frame=False,
                   sub_clip=False,
                   start_seconds=None,
                   end_seconds=None,
                   prediction=True):
    """
    Extract frames from the given video file.
    By default, this function saves the requested number of frames,
    selected by optical-flow score.
    Pass `num_frames` = -1 to save all frames instead.

    """

    if sub_clip and start_seconds is not None and end_seconds is not None:
        video_file = extract_subclip(video_file, start_seconds, end_seconds)

    video_name = os.path.splitext(os.path.basename(video_file))[0]
    video_name = video_name.replace(' ', '_')
    if out_dir is None:
        out_dir = os.path.splitext(video_file)[0]
    else:
        out_dir = os.path.join(out_dir, video_name)

    # add extracting methods to folder name
    out_dir = f"{out_dir}_{algo}"

    if not os.path.exists(out_dir):
        os.makedirs(out_dir, exist_ok=True)

    if algo == 'keyframes':
        out_dir = key_frames(video_file,
                             out_dir,
                             sub_clip=sub_clip,
                             start_seconds=start_seconds,
                             end_seconds=end_seconds)
        yield 100, f"Done. Frames located at {out_dir}."

    cap = cv2.VideoCapture(video_file)
    n_frames = int(cap.get(7))

    current_frame_number = int(cap.get(1))

    subtractor = cv2.createBackgroundSubtractorMOG2()
    keeped_frames = []

    ret, old_frame = cap.read()

    if keep_first_frame:
        # save the first frame
        out_frame_file = f"{out_dir}{os.sep}{video_name}_{current_frame_number:08}.png"
        cv2.imwrite(out_frame_file, old_frame)

    if num_frames < -1 or num_frames > n_frames:
        print(f'The video has {n_frames} number frames in total.')
        print('Please input a valid number of frames!')
        return
    elif num_frames == 1:
        print(f'Please check your first frame here {out_dir}')
        return
    elif num_frames > 2:
        # if save the first frame and the last frame
        # so to make the number of extract frames
        # as the user provided exactly
        if keep_first_frame:
            num_frames -= 1

    hsv = np.zeros_like(old_frame)
    hsv[..., 1] = 255

    # counter of frames seen so far (used by the 'random' sampling branch)
    ki = 0

    while ret:

        frame_number = int(cap.get(1))
        ret, frame = cap.read()

        progress_percent = ((frame_number + 1) / n_frames) * 100

        if num_frames == -1:
            out_frame_file = f"{out_dir}{os.sep}{video_name}_{frame_number:08}.png"
            if ret:
                cv2.imwrite(out_frame_file, frame)
                print(f'Saved the frame {frame_number}.')
            continue
        if algo == 'random' and num_frames != -1:
            if ret:
                if len(keeped_frames) < num_frames:
                    keeped_frames.append((ki, frame_number, frame))
                else:
                    j = random.randrange(ki + 1)
                    if j < num_frames:
                        keeped_frames[j] = ((j, frame_number, frame))
                ki += 1
                yield progress_percent, f'Processed {ki} frames.'
            continue
        if algo == 'flow' and num_frames != -1:
            mask = subtractor.apply(frame)
            old_mask = subtractor.apply(old_frame)

            out_frame = cv2.bitwise_and(frame, frame, mask=mask)
            old_out_frame = cv2.bitwise_and(old_frame,
                                            old_frame,
                                            mask=old_mask)
            try:
                out_frame = cv2.cvtColor(out_frame, cv2.COLOR_BGR2GRAY)
                old_out_frame = cv2.cvtColor(old_out_frame, cv2.COLOR_BGR2GRAY)

                flow = cv2.calcOpticalFlowFarneback(old_out_frame, out_frame,
                                                    None, 0.5, 3, 15, 3, 5,
                                                    1.2, 0)

                if show_flow:
                    mag, ang = cv2.cartToPolar(flow[..., 0], flow[..., 1])
                    hsv[..., 0] = ang * 180 / np.pi / 2
                    hsv[..., 2] = cv2.normalize(mag, None, 0, 255,
                                                cv2.NORM_MINMAX)
                    rgb = cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR)

                q_score = int(np.abs(np.sum(flow.reshape(-1))))
                progress_msg = f"precessing frame {frame_number},the diff to prev frame is {q_score}."

                if len(keeped_frames) < num_frames:
                    heapq.heappush(keeped_frames,
                                   ((q_score, frame_number, frame)))
                else:
                    heapq.heappushpop(keeped_frames,
                                      ((q_score, frame_number, frame)))

                if show_flow:
                    cv2.imshow("Frame", rgb)
                yield progress_percent, progress_msg
            except Exception:
                print('Skipping the current frame.')

            old_frame = frame
        key = cv2.waitKey(1)
        if key == 27:
            break

    for kf in keeped_frames:
        s, f, p = kf
        out_img = f"{out_dir}{os.sep}{video_name}_{f:08}_{s}.png"
        cv2.imwrite(out_img, p)
        # run the default Mask R-CNN prediction when 5 or fewer frames are selected
        if prediction and num_frames <= 5:
            try:
                inference.predict_mask_to_labelme(out_img, 0.7)
            except Exception:
                print("Please install pytorch and torchvision as follows.")
                print(
                    "pip install torch==1.8.0 torchvision==0.9.0 torchaudio==0.8.0"
                )
                pass

    cap.release()
    cv2.destroyAllWindows()
    yield 100, f"Done. Frames located at {out_dir}"
Example #43
0
# 	length = len(result)
# 	if length % 2 != 0:
# 		print(result[length // 2])
# 	else :
# 		print(min(result[length // 2], result[length// 2 - 1]))

mid = 0
for count in range(n):  # O(N log N) -> passes the time limit
    number = int(input())
    if count == 0:
        mid = number
    else:
        if mid < number:
            if len(right_heap) > len(left_heap):
                heapq.heappush(left_heap, (-mid, mid))
                mid = heapq.heappushpop(right_heap, (number, number))[1]
            elif len(right_heap) < len(left_heap):
                heapq.heappush(right_heap, number)
                mid = heapq.heappushpop(left_heap, (-mid, mid))[1]
            else:  # equal
                heapq.heappush(right_heap, (number, number))
                mid = heapq.heappushpop(left_heap, (-mid, mid))[1]
        else:
            if len(right_heap) > len(left_heap):
                heapq.heappush(left_heap, (-number, number))
                mid = heapq.heappushpop(right_heap, (mid, mid))[1]
            elif len(right_heap) < len(left_heap):
                heapq.heappush(right_heap, (mid, mid))
                mid = heapq.heappushpop(left_heap, (-number, number))[1]
            else:  # equal
                heapq.heappush(right_heap, (mid, mid))
Example #44
0
 def update_event(self, inp=-1):
     self.set_output_val(0, heapq.heappushpop(self.input(0), self.input(1)))
Example #45
0
                                                 featureVectorSVM)

                classifier = SVC()

                scaler = StandardScaler()
                train_x = scaler.fit_transform(train_x)
                test_x = scaler.transform(test_x)
                classifier.fit(train_x, train_y)
                accuracy = classifier.score(test_x, test_y)

                average_accuracy_svm += accuracy / 4.0

                if (len(SVMheap) < 400):
                    heappush(SVMheap, (average_accuracy_svm, svm_curr_idx))
                else:
                    heappushpop(SVMheap, (average_accuracy_svm, svm_curr_idx))

                svm_curr_idx += 1

            #analyze quality of feature vector using Shinkareva's method
            # store (sh result, bandnum, t, channel) to results_shinkareva

            flattenedVector = featureVectorSh.flatten()
            segmentsToCompare = [
                flattenedVector[:110], flattenedVector[110:220],
                flattenedVector[220:330], flattenedVector[330:440],
                flattenedVector[440:550], flattenedVector[550:660],
                flattenedVector[660:770], flattenedVector[770:880]
            ]

            avgpearson = 0
Example #46
0
    def process_batch(self, queries, candidates=None, top_n=1, n_docs=5,
                      return_context=False):
        """Run a batch of queries (more efficient)."""
        t0 = time.time()
        logger.info('Processing %d queries......' % len(queries))
        logger.info('Retrieving top %d docs......' % n_docs)

        # Rank documents for queries.
        if len(queries) == 1:
            ranked = [self.ranker.closest_docs(queries[0], k=n_docs)]
        else:
            ranked = self.ranker.batch_closest_docs(
                queries, k=n_docs, num_workers=self.num_workers
            )
        all_docids, all_doc_scores = zip(*ranked)

        # Flatten document ids and retrieve text from database.
        # We remove duplicates for processing efficiency.
        flat_docids = list({d for docids in all_docids for d in docids})
        did2didx = {did: didx for didx, did in enumerate(flat_docids)}
        doc_texts = self.processes.map(fetch_text, flat_docids)

        # Split and flatten documents. Maintain a mapping from doc (index in
        # flat list) to split (index in flat list).
        flat_splits = []
        didx2sidx = []
        for text in doc_texts:
            splits = self._split_doc(text)
            didx2sidx.append([len(flat_splits), -1])
            for split in splits:
                flat_splits.append(split)
            didx2sidx[-1][1] = len(flat_splits)

        # Push through the tokenizers as fast as possible.
        q_tokens = self.processes.map_async(tokenize_text, queries)
        s_tokens = self.processes.map_async(tokenize_text, flat_splits)
        q_tokens = q_tokens.get()
        s_tokens = s_tokens.get()

        # Group into structured example inputs. Examples' ids represent
        # mappings to their question, document, and split ids.
        examples = []
        for qidx in range(len(queries)):
            for rel_didx, did in enumerate(all_docids[qidx]):
                start, end = didx2sidx[did2didx[did]]
                for sidx in range(start, end):
                    if (len(q_tokens[qidx].words()) > 0 and
                            len(s_tokens[sidx].words()) > 0):
                        examples.append({
                            'id': (qidx, rel_didx, sidx),
                            'question': q_tokens[qidx].words(),
                            'qlemma': q_tokens[qidx].lemmas(),
                            'document': s_tokens[sidx].words(),
                            'lemma': s_tokens[sidx].lemmas(),
                            'pos': s_tokens[sidx].pos(),
                            'ner': s_tokens[sidx].entities(),
                        })

        logger.info('Reading %d paragraphs......' % len(examples))

        # Push all examples through the document reader.
        # We decode argmax start/end indices asynchronously on CPU.
        result_handles = []
        num_loaders = min(self.max_loaders, math.floor(len(examples) / 1e3))
        for batch in self._get_loader(examples, num_loaders):
            if candidates or self.fixed_candidates:
                batch_cands = []
                for ex_id in batch[-1]:
                    batch_cands.append({
                        'input': s_tokens[ex_id[2]],
                        'cands': candidates[ex_id[0]] if candidates else None
                    })
                handle = self.reader.predict(
                    batch, batch_cands, async_pool=self.processes
                )
            else:
                handle = self.reader.predict(batch, async_pool=self.processes)
            result_handles.append((handle, batch[-1], batch[0].size(0)))

        # Iterate through the predictions, and maintain priority queues for
        # top scored answers for each question in the batch.
        queues = [[] for _ in range(len(queries))]
        for result, ex_ids, batch_size in result_handles:
            s, e, score = result.get()
            for i in range(batch_size):
                # We take the top prediction per split.
                if len(score[i]) > 0:
                    item = (score[i][0], ex_ids[i], s[i][0], e[i][0])
                    queue = queues[ex_ids[i][0]]
                    if len(queue) < top_n:
                        heapq.heappush(queue, item)
                    else:
                        heapq.heappushpop(queue, item)

        # Arrange final top prediction data.
        all_predictions = []
        for queue in queues:
            predictions = []
            while len(queue) > 0:
                score, (qidx, rel_didx, sidx), s, e = heapq.heappop(queue)
                prediction = {
                    'doc_id': all_docids[qidx][rel_didx],
                    'span': s_tokens[sidx].slice(s, e + 1).untokenize(),
                    'doc_score': float(all_doc_scores[qidx][rel_didx]),
                    'span_score': float(score),
                }
                if return_context:
                    prediction['context'] = {
                        'text': s_tokens[sidx].untokenize(),
                        'start': s_tokens[sidx].offsets()[s][0],
                        'end': s_tokens[sidx].offsets()[e][1],
                    }
                predictions.append(prediction)
            all_predictions.append(predictions[-1::-1])

        logger.info('Processed %d queries in %.4f (s)......' %
                    (len(queries), time.time() - t0))

        return all_predictions
Example #47
0
 def add(self, file_path, file_size):
     if len(self.largest_files) < self.total_files:
         heappush(self.largest_files, (file_size, file_path))
     else:
         heappushpop(self.largest_files, (file_size, file_path))
Example #48
0
import heapq
from sys import stdin

if __name__ == '__main__':
    heap = []
    heap_aux = []
    len_p = int(stdin.readline())
    for x in range(len_p):
        a = int(stdin.readline())
        if not heap:
            heapq.heappush(heap, a)
        else:
            if len(heap) > len(heap_aux):
                if heap[0] < a:
                    b = heapq.heappushpop(heap, a)
                    heapq.heappush(heap_aux, -b)
                else:
                    heapq.heappush(heap_aux, -a)
            else:
                if -heap_aux[0] > a:
                    b = -heapq.heappushpop(heap_aux, -a)
                    heapq.heappush(heap, b)
                else:
                    heapq.heappush(heap, a)
        if len(heap) > len(heap_aux):
            print("%.1f" % heap[0])
        else:
            print("%.1f" % ((heap[0] - heap_aux[0]) / 2))
Example #49
0
    def process_batch(self, queries, top_n=1, n_docs=5, return_context=False):
        """Run a batch of queries (more efficient)."""
        t3 = time.time()
        logger.info('Processing %d queries...' % len(queries))
        logger.info('Retrieving top %d docs...' % n_docs)

        # Rank documents for queries.
        if len(queries) == 1:
            ranked = [self.ranker.closest_docs(queries[0], k=n_docs)]
        else:
            ranked = self.ranker.batch_closest_docs(
                queries, k=n_docs, num_workers=self.num_workers)

        t4 = time.time()
        logger.info('docs retrieved [time]: %.4f s' % (t4 - t3))
        all_docids, all_doc_scores, all_doc_texts = zip(*ranked)

        # Flatten document ids and retrieve text from database.
        # We remove duplicates for processing efficiency.
        flat_docids, flat_doc_texts = zip(
            *{(d, t)
              for doc_ids, doc_texts in zip(all_docids, all_doc_texts)
              for d, t in zip(doc_ids, doc_texts)})

        # flat_docids = list({d for docids in all_docids for d in docids})
        did2didx = {did: didx for didx, did in enumerate(flat_docids)}
        # flat_doc_texts = list({t for doc_texts in all_doc_texts for t in doc_texts})
        # logger.info('doc_texts for top %d docs extracted' % n_docs)

        # Split and flatten documents. Maintain a mapping from doc (index in
        # flat list) to split (index in flat list).
        flat_splits = []
        didx2sidx = []
        for text in flat_doc_texts:
            splits = self._split_doc(text)
            didx2sidx.append([len(flat_splits), -1])
            for split in splits:
                flat_splits.append(split)
            didx2sidx[-1][1] = len(flat_splits)
        t5 = time.time()
        # logger.debug('doc_texts flattened')

        # Push through the tokenizers as fast as possible.
        q_tokens = self.processes.map_async(tokenize_text, queries)
        s_tokens = self.processes.map_async(tokenize_text, flat_splits)
        q_tokens = q_tokens.get()
        s_tokens = s_tokens.get()
        # logger.info('q_tokens: %s' % q_tokens)
        # logger.info('s_tokens: %s' % s_tokens)
        t6 = time.time()
        logger.info('doc texts tokenized [time]: %.4f s' % (t6 - t5))

        # Group into structured example inputs. Examples' ids represent
        # mappings to their question, document, and split ids.
        examples = []
        for qidx in range(len(queries)):
            q_text = q_tokens[qidx].words()
            para_lens = []
            for rel_didx, did in enumerate(all_docids[qidx]):
                start, end = didx2sidx[did2didx[did]]
                for sidx in range(start, end):
                    para_text = s_tokens[sidx].words()
                    if len(q_text) > 0 and len(para_text) > 0:
                        examples.append({
                            'id': (qidx, rel_didx, sidx),
                            'question': q_text,
                            # 'qlemma': q_tokens[qidx].lemmas(),
                            'document': para_text,
                            'document_char': s_tokens[sidx].chars(),
                            'question_char': q_tokens[qidx].chars(),
                            # 'lemma': s_tokens[sidx].lemmas(),
                            # 'pos': s_tokens[sidx].pos(),
                            # 'ner': s_tokens[sidx].entities(),
                            'doc_score': float(all_doc_scores[qidx][rel_didx])
                        })
                        # r = {'w': para_text}
                        # f = open('/tmp/data.json', 'w')
                        # f.write(json.dumps(r))
                        # f.close()
                        # exit(0)
                        para_lens.append(len(s_tokens[sidx].words()))
            # logger.debug('question_p: %s paragraphs: %s' % (queries[qidx], para_lens))
        t7 = time.time()
        logger.info('paragraphs prepared [time]: %.4f s' % (t7 - t6))

        result_handles = []
        num_loaders = min(self.max_loaders,
                          int(math.floor(len(examples) / 1e3)))
        for batch in self._get_loader(examples, num_loaders):
            handle = self.reader.predict(batch, async_pool=self.processes)
            result_handles.append((handle, batch[-1], batch[0].size(0)))

        t8 = time.time()
        logger.info('paragraphs predicted [time]: %.4f s' % (t8 - t7))

        # Iterate through the predictions, and maintain priority queues for
        # top scored answers for each question in the batch.
        queues = [[] for _ in range(len(queries))]
        for result, ex_ids, batch_size in result_handles:
            s, e, score = result.get()
            for i in range(batch_size):
                # We take the top prediction per split.
                if len(score[i]) > 0:
                    item = (score[i][0], ex_ids[i], s[i][0], e[i][0])
                    queue = queues[ex_ids[i][0]]
                    if len(queue) < top_n:
                        heapq.heappush(queue, item)
                    else:
                        heapq.heappushpop(queue, item)

        logger.info('answers processed...')
        # Arrange final top prediction data.
        all_predictions = []
        for queue in queues:
            predictions = []
            while len(queue) > 0:
                score, (qidx, rel_didx, sidx), s, e = heapq.heappop(queue)
                prediction = {
                    'doc_id': all_docids[qidx][rel_didx],
                    'start': int(s),
                    'end': int(e),
                    'span': s_tokens[sidx].slice(s, e + 1).untokenize(),
                    'doc_score': float(all_doc_scores[qidx][rel_didx]),
                    'span_score': float(score)
                }
                if return_context:
                    prediction['context'] = {
                        'text': s_tokens[sidx].untokenize(),
                        'start': s_tokens[sidx].offsets()[s][0],
                        'end': s_tokens[sidx].offsets()[e][1],
                    }
                predictions.append(prediction)
            all_predictions.append(predictions[-1::-1])

        logger.info('%d queries processed [time]: %.4f s' %
                    (len(queries), time.time() - t3))

        return all_predictions
Example #50
0
# The Heap data structure

A heap is either a max-heap or a min-heap.
Max-heap: every parent node's value is greater than or equal to its children's values.
Min-heap: every parent node's value is less than or equal to its children's values.
For a parent at index x, the left child is at index 2x and the right child at index 2x+1.

Using these properties of a heap, you can implement a priority queue.

Stack: the item inserted last is taken out first.
Queue: the item inserted first is taken out first.
Priority queue: the item with the highest priority is taken out first.
=> In short, a data structure designed to make finding the maximum or minimum value fast.


An interesting property of heapq's heap is that the smallest element is always the root, heap[0].

import heapq (as hq)
heapq.heappush(a, n) : # push n onto the list a while keeping the heap invariant
heapq.heappop(a) : # pop and return the smallest item while keeping the heap invariant
# raises IndexError if the heap is empty; to read the smallest item without popping, use heap[0]

heapq.heappushpop(a, n) : # push n onto the heap, then pop and return the smallest item
This is more efficient than calling heappush() and heappop() separately.

heapq.heapify(a) # transform the list a into a heap, in place

Note that heapq.heapify(a) does not leave a sorted in ascending order.
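
A minimal runnable sketch of the heapq calls described above (the sample values are illustrative):

import heapq

a = [7, 3, 9, 1]
heapq.heapify(a)                     # a is now a valid min-heap, though not fully sorted
heapq.heappush(a, 4)                 # push 4 while keeping the heap invariant
smallest = heapq.heappop(a)          # 1; heap[0] is always the smallest element
unchanged = heapq.heappushpop(a, 2)  # 2 <= a[0], so 2 is returned and a is unchanged
print(a[0], smallest, unchanged)     # 3 1 2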

Example #51
0
def main():
    fraglen = 1000000

    parser = argparse.ArgumentParser(description='A memory usage estimator.')
    parser.add_argument('-i',
                        '--input',
                        metavar='BAM',
                        action='append',
                        help='Input BAM file name, required',
                        required=True)
    parser.add_argument('-t',
                        '--thread_count',
                        metavar='NUM',
                        dest='nthr',
                        help='Number of threads, required',
                        type=int,
                        required=True)
    parser.add_argument('-f',
                        '--frag_len',
                        metavar='LEN',
                        dest='fraglen',
                        help='Fragment length, default %d' % fraglen,
                        type=int,
                        default=fraglen)
    parser.add_argument('-q',
                        '--qcal',
                        action='store_true',
                        help='Whether qcal correction will be applied')
    parser.add_argument('-d',
                        '--densest',
                        metavar='N',
                        help='List the densest N fragments',
                        type=int,
                        default=0)
    parser.add_argument(
        'shard',
        nargs='*',
        help='A list of shards. Each shard may be a comma separated list of '
        'genomic locations, default one shard per chromosome')
    args = parser.parse_args()

    chroms = {}
    density = {}
    for bamf in args.input:
        hdr = BAMHeader(bamf)
        idx = BAMIndex(bamf)
        for (tchr, tlen), (size, dens) in zip(hdr.chrs, idx.density()):
            d = density.setdefault(tchr, [])
            if len(d) < len(dens):
                d.extend([0] * (len(dens) - len(d)))
            for i in xrange(len(dens)):
                d[i] += dens[i]
            chroms[tchr] = max(chroms.get(tchr, 0), tlen)
    chr_tid = dict((chr[0], i) for i, chr in enumerate(hdr.chrs))

    multiplier = 4 * 2.0
    if args.qcal:
        multiplier *= 2
    fraglen = args.fraglen
    nthr = args.nthr
    if len(args.shard) > 0:
        shards = map(parse_shard, args.shard)
    else:
        shards = [([(c, 0, n)], c) for c, n in chroms.iteritems()]
    try:
        shards.sort(key=lambda shard: chr_tid[shard[1]])
    except KeyError:
        shards.sort(key=lambda shard: shard[1])
    nmax = [(0, None, 0)] * args.densest
    for shard, name in shards:
        dmax = [0] * nthr
        for c, s, e in shard:
            if c not in density:
                continue
            for f in xrange(s, e, fraglen):
                i = f / 16384
                j = (f + fraglen + 16383) / 16384
                for k in xrange(i, j):
                    if k >= len(density[c]):
                        break
                    d = density[c][k]
                    heapq.heappushpop(dmax, d)
                    heapq.heappushpop(nmax, (d, c, k * 16384))
        d = sum(dmax) * multiplier / (1024 * 1024 * 1024)
        print '%s\t%f' % (name, d)

    if args.densest > 0:
        print '\nThe densest %d fragments:' % args.densest
        for d, c, s in heapq.nlargest(args.densest, nmax):
            if c is None: continue
            frag = '%s:%d-%d' % (c, s + 1, s + 16384)
            print '%-32s%f' % (frag, d / (1024 * 1024 * 1024.))
Example #52
0
import heapq

# sample data (the original snippet did not show where `arr` was defined)
arr = [3, 1, 4, 1, 5, 9, 2, 6]

heap = []

# heapq.heapify(list): turn the list into a heap, in place
heapq.heapify(arr)

# heapq.heappush(heap, item): add an element to the heap; the result is still a heap
heapq.heappush(arr, 5)

# heapq.heappop(heap): remove and return the smallest element
a1 = heapq.heappop(arr)

# heapq.heapreplace(heap, item): pop and return the smallest element, then add the new item
a2 = heapq.heapreplace(arr, 8)

# heapq.heappushpop(heap, item): push item, then pop and return the smallest element;
# if item <= heap[0], the heap is left unchanged and item itself is returned
a3 = heapq.heappushpop(arr, 5)

# heapq.merge(*iterables): merge multiple sorted inputs into one sorted iterator;
# the inputs must already be sorted, otherwise the output will not be fully sorted
arr1 = [1, 2, 4]
arr2 = [4, 3, 1]
heapq.heapify(arr1)
heapq.heapify(arr2)
a4 = list(heapq.merge(arr, arr1, arr2))

# heapq.nlargest(n, iterable): return the n largest elements
a5 = heapq.nlargest(3, arr)

# heapq.nsmallest(n, iterable): return the n smallest elements
a6 = heapq.nsmallest(3, arr)

print()
Example #53
0
def AutoRecommend(tp_cards, junks, pairs, triples, booms, straights, flushs,
                  _2_pairs, _32_tps):
    _third = booms + _32_tps + flushs + straights + triples + _2_pairs + pairs + junks
    _second = booms + _32_tps + flushs + straights + triples + _2_pairs + pairs + junks
    _first = triples + pairs + junks
    nw_cards = tp_cards.copy()
    nw_cards.sort(key=lambda x: -x[1])
    #print(nw_cards)
    q = []
    hyper_n = 20
    my_weight = [0.2, 0.3, 0.5]
    heapq.heapify(q)

    for i in _third:

        #print("i = ",i)
        nwcs = nw_cards.copy()
        #print("nwcs0 ", nwcs)

        tail = i.copy()
        for ii in tail:
            nwcs.remove(ii)

        tp_nwcs0 = nwcs.copy()
        #print("nwcs after ii ", nwcs)

        for j in _second:
            nwcs = tp_nwcs0.copy()
            #print("j = ",j)
            mid = j.copy()
            flg = 1
            for jj in mid:
                if (jj in nwcs):
                    nwcs.remove(jj)
                else:
                    flg = 0
                    break
            if (flg == 0):
                continue

            tp_nwcs1 = nwcs.copy()
            #print("nwcs after jj ", nwcs)
            for k in _first:
                nwcs = tp_nwcs1.copy()
                #print("k = ",k)
                head = k.copy()
                flg = 1
                for kk in head:
                    if (kk in nwcs):
                        nwcs.remove(kk)
                    else:
                        flg = 0
                        break
                if (flg == 0):
                    continue

                #print("nwcs the rest ", nwcs)
                #print("nw_cards_0 = ", head, mid, tail)
                #complete the head
                pos = 0
                while ((len(head) < 3) and (pos < len(nwcs))):
                    head += [nwcs[pos]]
                    pos += 1

                #complete the mid
                while ((len(mid) < 5) and (pos < len(nwcs))):
                    mid += [nwcs[pos]]
                    pos += 1

                #complete the last
                while ((len(tail) < 5) and (pos < len(nwcs))):
                    tail += [nwcs[pos]]
                    pos += 1

                w_h, w_m, w_t = get_weight(head, 0), get_weight(mid,
                                                                1), get_weight(
                                                                    tail, 2)
                nw_w = (np.array([w_h, w_m, w_t]) * np.array(my_weight)).sum()
                #print("nw_cards_1 = ", head, mid, tail)
                chk_val = chk_ordered(head, mid, tail)
                if (chk_val[0] == 1):
                    if (len(q) < hyper_n):
                        heapq.heappush(q, HandCard(head + mid + tail, nw_w))
                    else:
                        if (nw_w > q[0].weight):
                            heapq.heappushpop(
                                q, HandCard(head + mid + tail, nw_w))

    result_cards = []
    while (len(q) > 0):
        result_cards.append(heapq.heappop(q))
    ret = result_cards[len(result_cards) - 1].list
    return ret
Example #54
0
                markerType=cv2.MARKER_TILTED_CROSS,
                thickness=2)
 error = False
 while True:
     step += 1
     old = (int(loc[0].item()), int(loc[1].item()))
     try:
         optimizer.zero_grad()
         value = AutogradFn.apply(fn, loc)
     except:
         # break the loop with error
         error = True
         break
     # push value on heap
     if len(best_n) >= n:
         worst = heapq.heappushpop(best_n, -value)
         if (-value) == worst:
             # break the loop
             break
     else:
         heapq.heappush(best_n, -value)
     value.backward()
     optimizer.step()
     new = (int(loc[0].item()), int(loc[1].item()))
     # Visualize each iteration by drawing on vis
     cv2.line(vis, old, new, (255, 255, 255), 2)
     cv2.imshow('Progress', vis)
     cv2.waitKey(50)  # 20 fps, tune according to your liking
 # mark final point if no error
 if not error:
     cv2.drawMarker(vis,
Example #55
0
    def process_batch(self,
                      queries,
                      candidates=None,
                      top_n=1,
                      n_docs=5,
                      context=None,
                      data=None):
        """Run a batch of queries (more efficient)."""
        def get_page_numbers(all_page_numbers_, query_number_, rel_didx_):
            if (len(all_page_numbers_) > query_number_
                    and all_page_numbers_[query_number_]):
                return all_page_numbers_[query_number_][rel_didx_]
            else:
                return 0

        t0 = time.time()
        logger.info("Processing %d queries..." % len(queries))
        logger.info("Retrieving top %d docs..." % n_docs)

        if not context:
            context = {"return": False, "window": None}

        # Rank documents for queries.
        if len(queries) == 1:

            id_token = data["id_token"] if data and "id_token" in data else ""
            access_token = data[
                "access_token"] if data and "access_token" in data else ""

            if id_token or access_token:
                ranked = [
                    self.ranker.closest_docs(queries[0],
                                             k=n_docs,
                                             id_token=id_token,
                                             access_token=access_token)
                ]
            else:
                ranked = [self.ranker.closest_docs(queries[0], k=n_docs)]
        else:
            ranked = self.ranker.batch_closest_docs(
                queries, k=n_docs, num_workers=self.num_workers)
        all_doc_ids, all_doc_scores, all_page_numbers = zip(*ranked)

        # Flatten document ids and retrieve text from database.
        # We remove duplicates for processing efficiency.
        flat_doc_ids = list({d for doc_ids in all_doc_ids for d in doc_ids})
        doc_id2doc_index = {
            doc_id: doc_index
            for doc_index, doc_id in enumerate(flat_doc_ids)
        }

        if self.ranker.name in ["elastic", "custom"]:
            # HACK, as cannot pickle thread-locked objects
            doc_texts = [
                self.ranker.get_doc_text(flat_docid)
                for flat_docid in flat_doc_ids
            ]
        else:
            doc_texts = self.processes.map(fetch_text, flat_doc_ids)

        # Split and flatten documents. Maintain a mapping from doc (index in
        # flat list) to split (index in flat list).
        flat_splits = []
        doc_index2split_index = []
        for text in doc_texts:

            doc_index2split_index.append([len(flat_splits), -1])
            for split in self._split_doc(text, self.ranker, queries, data):
                flat_splits.append(split)
            doc_index2split_index[-1][1] = len(flat_splits)

        # Push through the tokenizers as fast as possible.
        q_tokens = self.processes.map_async(tokenize_text, queries)
        s_tokens = self.processes.map_async(tokenize_text, flat_splits)
        q_tokens = q_tokens.get()
        s_tokens = s_tokens.get()

        # Group into structured example inputs. Examples' ids represent
        # mappings to their question, document, and split ids.
        examples = []
        for query_number in range(len(queries)):
            for rel_didx, document_id in enumerate(all_doc_ids[query_number]):

                start, end = doc_index2split_index[
                    doc_id2doc_index[document_id]]

                for sidx in range(start, end):
                    if len(q_tokens[query_number].words()) > 0 and len(
                            s_tokens[sidx].words()) > 0:
                        examples.append({
                            "id": (query_number, rel_didx, sidx),
                            "question": q_tokens[query_number].words(),
                            "qlemma": q_tokens[query_number].lemmas(),
                            "document": s_tokens[sidx].words(),
                            "lemma": s_tokens[sidx].lemmas(),
                            "pos": s_tokens[sidx].pos(),
                            "ner": s_tokens[sidx].entities(),
                        })

        logger.info("Reading %d paragraphs..." % len(examples))

        # Push all examples through the document reader.
        # We decode argmax start/end indices asynchronously on CPU.
        result_handles = []
        num_loaders = min(self.max_loaders, math.floor(len(examples) / 1e3))
        for batch in self._get_loader(examples, num_loaders):
            if candidates or self.fixed_candidates:
                batch_cands = []
                for ex_id in batch[-1]:
                    batch_cands.append({
                        "input": s_tokens[ex_id[2]],
                        "cands": candidates[ex_id[0]] if candidates else None
                    })
                handle = self.reader.predict(batch,
                                             batch_cands,
                                             async_pool=self.processes)
            else:
                handle = self.reader.predict(batch, async_pool=self.processes)
            result_handles.append((handle, batch[-1], batch[0].size(0)))

        # Iterate through the predictions, and maintain priority queues for
        # top scored answers for each question in the batch.
        queues = [[] for _ in range(len(queries))]
        for result, ex_ids, batch_size in result_handles:
            s, e, score = result.get()
            for i in range(batch_size):
                # We take the top prediction per split.
                if len(score[i]) > 0:
                    item = (score[i][0], ex_ids[i], s[i][0], e[i][0])
                    queue = queues[ex_ids[i][0]]
                    if len(queue) < top_n:
                        heapq.heappush(queue, item)
                    else:
                        heapq.heappushpop(queue, item)

        # Arrange final top prediction data.
        all_predictions = []

        for queue in queues:
            predictions = []
            while len(queue) > 0:
                score, (query_number, rel_didx,
                        sidx), s, e = heapq.heappop(queue)
                prediction = {
                    "doc_id": all_doc_ids[query_number][rel_didx],
                    "page_number": get_page_numbers(all_page_numbers,
                                                    query_number, rel_didx),
                    "span": s_tokens[sidx].slice(s, e + 1).untokenize(),
                    "doc_score": float(all_doc_scores[query_number][rel_didx]),
                    "span_score": float(score),
                    "question": queries[query_number]
                }
                if context["return"]:
                    prediction["context"] = {
                        "text": s_tokens[sidx].untokenize(),
                        "start": s_tokens[sidx].offsets()[s][0],
                        "end": s_tokens[sidx].offsets()[e][1],
                    }
                predictions.append(prediction)
            all_predictions.append(predictions[-1::-1])

        logger.info("Processed %d queries in %.4f (s)" %
                    (len(queries), time.time() - t0))

        return all_predictions
Example #56
0
def GetNearestElements(user_id, current_context, suggestees, k=10):
    """
    Returns k nearest neighbours of current_context in user_history.

    user_history is the output of ExtractFeatures.
    current_context is a feature vector with NUM_FEATURES elements.
    """

    if type(user_id) is int:
        user_history = ExtractFeatures(user_id)
    else:
        user_history = user_id
    user_interest = GetUserInterest(user_id, current_context, suggestees)

    neighbours = []
    counts = {}
    for entry in user_history:
        dist = GetDist(entry[1:], current_context)
        if dist > kMaxDistThreshold:
            continue
        if len(counts) < k:
            heapq.heappush(neighbours, (-dist, entry[0]))
            if entry[0] not in counts:
                counts[entry[0]] = 1
            else:
                counts[entry[0]] += 1
        elif dist < -neighbours[0][0]:
            _, smallest = heapq.heappushpop(neighbours, (-dist, entry[0]))
            if entry[0] not in counts:
                counts[entry[0]] = 1
            else:
                counts[entry[0]] += 1
            counts[smallest] -= 1
            if counts[smallest] == 0:
                del counts[smallest]

    # TODO(kadircet): Add data coming from cold start or maybe most liked N
    # elements into the base tags too.
    base_tags = GetTagWeights(counts.keys())
    similar_suggestees = GetSimilarSuggestees(
        None, base_tags=base_tags, similarity_metric=WeightedJaccardSimilarity)
    neighbours = []
    for suggestee_id, count in user_interest.items():
        history_count = counts.get(suggestee_id, 0)
        # If the user simply disliked it and never ate it, abandon the choice.
        if history_count == 0 and count < 0:
            continue
        counts.pop(suggestee_id, 0)
        neighbours.append((history_count * kHistoryCoef + count, suggestee_id))
    for suggestee_id, history_count in counts.items():
        neighbours.append((history_count * kHistoryCoef, suggestee_id))
    max_count = max(max(neighbours)[0], 1)

    def CountsToProb(x):
        return (x[0] / max_count, x[1])

    neighbours = list(map(CountsToProb, neighbours))
    neighbours.extend(similar_suggestees)
    neighbours.sort()
    neighbours.reverse()

    return tuple(map(lambda x: int(x[1]), neighbours))[:20]
Example #57
0
    def explain(self, userid, user_items, itemid, user_weights=None, N=10):
        """ Provides explanations for why the item is liked by the user.

        Parameters
        ---------
        userid : int
            The userid to explain recommendations for
        user_items : csr_matrix
            Sparse matrix containing the liked items for the user
        itemid : int
            The itemid to explain recommendations for
        user_weights : ndarray, optional
            Precomputed Cholesky decomposition of the weighted user liked items.
            Useful for speeding up repeated calls to this function, this value
            is returned
        N : int, optional
            The number of liked items to show the contribution for

        Returns
        -------
        total_score : float
            The total predicted score for this user/item pair
        top_contributions : list
            A list of the top N (itemid, score) contributions for this user/item pair
        user_weights : ndarray
            A factorized representation of the user. Passing this in to
            future 'explain' calls will lead to noticeable speedups.
            A usage sketch follows this method.
        """
        # user_weights = Cholesky decomposition of Wu^-1
        # from section 5 of the paper CF for Implicit Feedback Datasets
        user_items = user_items.tocsr()
        if user_weights is None:
            A, _ = user_linear_equation(self.item_factors, self.YtY,
                                        user_items, userid,
                                        self.regularization, self.factors)
            user_weights = scipy.linalg.cho_factor(A)
        seed_item = self.item_factors[itemid]

        # weighted_item = y_i^t W_u
        weighted_item = scipy.linalg.cho_solve(user_weights, seed_item)

        total_score = 0.0
        h = []
        h_len = 0
        for itemid, confidence in nonzeros(user_items, userid):
            if confidence < 0:
                continue

            factor = self.item_factors[itemid]
            # s_u^ij = (y_i^t W^u) y_j
            score = weighted_item.dot(factor) * confidence
            total_score += score
            contribution = (score, itemid)
            if h_len < N:
                heapq.heappush(h, contribution)
                h_len += 1
            else:
                heapq.heappushpop(h, contribution)

        items = (heapq.heappop(h) for i in range(len(h)))
        top_contributions = list((i, s) for s, i in items)[::-1]
        return total_score, top_contributions, user_weights
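
A hedged usage sketch, assuming this method belongs to a trained implicit-style ALS model (the data and model construction below are illustrative, not taken from the original code):

import numpy as np
import scipy.sparse as sparse
from implicit.als import AlternatingLeastSquares  # assumed host library

# toy implicit-feedback matrix: 100 users x 50 items
user_items = sparse.csr_matrix(
    (np.random.random((100, 50)) < 0.1).astype(np.float64))
model = AlternatingLeastSquares(factors=16, regularization=0.01, iterations=5)
model.fit(user_items)  # newer implicit versions expect a user-item matrix here

total_score, top_contributions, user_weights = model.explain(
    userid=0, user_items=user_items, itemid=3, N=5)
print(total_score, top_contributions)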
Example #58
0
print("Heap after adding new element:", heap)
'''
heappop()
---------
- removes smallest element from the min Heap.
'''
print("Smallest element of the heap", hq.heappop(heap))
print("Next smallest element of the heap", hq.heappop(heap))
print("Next smallest element of the heap", hq.heappop(heap))
'''
heappushpop()
-------------
- This first adds the new element to the heap and then pops and returns the smallest element.
- Here we push a new node, 95, onto the heap and pop the smallest.
'''
popped_min_node = hq.heappushpop(heap, 95)
print(heap)
'''
heapify()
- This function accepts some random list and then converts it into a heap.
'''
raw_list = [4, 1, 9, 12, 45, 63, 2, 0]
hq.heapify(raw_list)
print("Heapified list:", raw_list)
'''
heapreplace()
-------------
- This function deletes the smallest node and then inserts the new node. It is more efficient than calling heappop() followed by heappush() separately.
- We will use the above heapified raw list as an example.
'''
hq.heapreplace(raw_list, 10)
Example #59
0
 def append(self, key, value):
     if len(self.heap) < self.size:
         heapq.heappush(self.heap, SentenceScore(key, value))
     else:
         heapq.heappushpop(self.heap, SentenceScore(key, value))
Example #60
0
heapq.heapify(list)
print(list)

# Remove the smallest value; since heap[0] is always the smallest element of a heap, this pops the first element.
print(list)
heapq.heappop(list)
print(list)

# Remove the smallest element, then add the new element
print(list)
heapq.heapreplace(list, 99)
print(list)

# Push the new value, then pop and return the smallest; if the new value is not larger than the current minimum, the heap is left unchanged and the value itself is returned
print(list)
heapq.heappushpop(list, 6)
print(list)
heapq.heappushpop(list, 1)
print(list)

# Merge multiple sorted inputs into a single sorted iterator
print(list)
h = [1000]
heapq.heapify(h)

for i in heapq.merge(h, list):
    print(i, end=" ")

# Query the n largest elements in the heap; n is the number of elements to return
print(list)
print(heapq.nlargest(3, list))