Example #1
0
 def reducer_final(self):
     mk_term_largest = heapq.nlargest(10,self.mk_top10_termlist)
     mk_top10term = [(key,int(count)) for count,key in mk_term_largest]
     ks_term_largest = heapq.nlargest(10,self.ks_top10_termlist)
     ks_top10term = [(key,int(count)) for count,key in ks_term_largest]
     yield None,('mk',mk_top10term)
     yield None,('ks',ks_top10term)
def count_mon_max_data_avg(fileObj, year, cat_flag):
    count = 0
    flag = 0
    avg_data = []
    year_data = []
    fd = open(fileObj, 'r')
    for line in fd.readlines():
        a = re.split(',|\n| ', line)
        # Only keep rows that belong to the requested year; empty fields are skipped
        if int(a[YEAR]) == year:
            if flag == 0:
                temp = int(a[MON])
                flag = 1
            if len(a[cat_flag]) != 0:
                year_data.append(float(a[cat_flag]))
                count = count + 1
            if temp != int(a[MON]):
                # Month changed: average the top RATE fraction of the finished month
                length = int(count * RATE)
                if length == 0:
                    length = 1
                value = heapq.nlargest(length, year_data)
                avg = mean(value)
                avg_data.append(avg)
                year_data = []
                count = 0
            temp = int(a[MON])
    # Flush the final month
    length = int(count * RATE)
    if length == 0:
        length = 1
    value = heapq.nlargest(length, year_data)
    avg = mean(value)
    avg_data.append(avg)
    fd.close()
    return avg_data
Example #3
0
 def mapper_final_term_gettop10(self):
     mk_term_largest = heapq.nlargest(10,self.mk_term)
     ks_term_largest = heapq.nlargest(10,self.ks_term)
     for count,key in mk_term_largest:
         yield ('mk_heap',(count,key))
     for count,key in ks_term_largest:
         yield ('ks_heap',(count,key))
Example #4
0
def timeit_plot3D(data, xlabel='xlabel', ylabel='ylabel', **kwargs): 
    """3D plot of timeit data, one chart per function. 
    """
    dataT = {}
    figs = []
    series = kwargs.get('series', (0,1))
    cmap = kwargs.get('cmap', cm.coolwarm)
    for k, v in data.items():
        dataT[k] = list(zip(*v))  # list() so the transposed columns stay indexable on Python 3
        fig = plt.figure()
        ax = fig.gca(projection='3d')
        X, Y, Z = dataT[k][series[0]], dataT[k][series[1]], dataT[k][-1]
        wide, tall = (max(X)-min(X)+1), (max(Y)-min(Y)+1)
        intervalX = max(X) - min(heapq.nlargest(2,set(X)))
        intervalY = max(Y) - min(heapq.nlargest(2,set(Y)))
        wide, tall = 1 + wide // intervalX, 1 + tall // intervalY  # integer grid dimensions for reshape
        X = np.reshape(X, [wide, tall])
        Y = np.reshape(Y, [wide, tall])
        # TODO: BUG: fix so that Z transposes with x & y reversed
        Z = np.reshape(Z, [wide, tall])
        surf = ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap=cmap, linewidth=0, antialiased=False)
        ax.zaxis.set_major_locator(LinearLocator(10))
        ax.zaxis.set_major_formatter(FormatStrFormatter('%.02f'))
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        ax.set_title(substitute_titles(k,series))
        fig.colorbar(surf, shrink=0.5, aspect=5)
        figs.append(fig)
    return figs
Example #5
0
def filter_incoming_data(data):
	hor = data[0:4]
	vert = data[4:]
	deltaf_threshold = 60

	# threshold proximity
	if ((max(hor) < deltaf_threshold) or max(vert) < deltaf_threshold):
		print "LOW: ", data
		return (-1,-1)

	# find largest two values in array
	# store index associated with values too
	# format: [(index1, largestval), (index2, secondlargestval)]
	maxHor = heapq.nlargest(2, enumerate(hor), key=lambda x: x[1])
	maxVert = heapq.nlargest(2, enumerate(vert), key=lambda x: x[1])

	# check that they're neighbors
	if (abs(maxHor[1][0] - maxHor[0][0]) != 1):
		print "Horizontal messed up", maxHor
		return (-1,-1)
	if (abs(maxVert[1][0] - maxVert[0][0]) != 1):
		print "Vertical messed up", maxVert
		return (-1,-1)
	if (maxHor[0][1] < 1) or (maxHor[1][1] < 1) or (maxVert[0][1] < 1) or (maxVert[1][1] < 1):
		print "value is neg"
		return (-1,-1)
	return (maxHor, maxVert)
def sortcat(n, *args):
    
    # check if each argument after n is a string
    for i in args:
        if not isinstance(i, str):
            print("Usage: sortcat(int, string1, string2,...)")
            return 1
    
    from heapq import nlargest
    
    # check if n is -1
    if n == -1:
        x = len(args)
        list = nlargest(x, args, key = len) # make a list of the n longest string arguments
        string = ("").join(list) # join them into a string
        
        print(string)
        return 0
    
    # if n is an integer
    
    list = nlargest(n, args, key = len) # make a list of the n longest string arguments
    string = ("").join(list) # join them into a string
    print(string)
    
    return 0
Example #7
0
    def addElement(self, num):
        if len(self.minHeap) == 0:
            heapq.heappush(self.minHeap, num)
        else:
            minHeapTop = heapq.nsmallest(1,self.minHeap)[0]
            if len(heapq.nlargest(1,self.maxHeap)) == 0:
                heapq.heappush(self.maxHeap, min(num,minHeapTop))
                heapq.heappushpop(self.minHeap, max(num,minHeapTop))
            else:
                maxHeapTop = heapq.nlargest(1,self.maxHeap)[0]

                if num > minHeapTop:
                    tmp = minHeapTop
                    minHeapTop = num
                    num = tmp
                elif num < maxHeapTop:
                    tmp = maxHeapTop
                    maxHeapTop = num
                    num = tmp
                
                heapq.heappushpop(self.minHeap, minHeapTop)
                heapq._heappushpop_max(self.maxHeap, maxHeapTop)
                
                if len(self.minHeap) - len(self.maxHeap) > 0:
                    heapq.heappush(self.maxHeap,num)
                else:
                    heapq.heappush(self.minHeap,num)
Example #8
0
def k_NN():
    similar = {}

    for i, dict in enumerate(training_data):
        similar[i] = len(set(dict['feature_vector']).intersection(set(test_features)))

    knn = nlargest(k_neighbors, similar, key=similar.get)
#    print knn

    category = {'Tech':0, 'Non-Tech':0}

    for neighbor in knn:
        if training_data[neighbor]['label'] == 'Tech':
            category['Tech'] += 1
        else:
            category['Non-Tech'] += 1

#    print category
    label_knn = nlargest(1, category, key=category.get)[0]
    print "k-NN thinks it is a " + label_knn + " article."

    if label_knn=="Tech":
        return 1
    else:
        return 0
Example #9
0
def main():
    logging.basicConfig(level=logging.INFO, format='%(message)s')

    parser = argparse.ArgumentParser(description='Print LDA model')
    parser.add_argument('model', help='trained model')
    args = parser.parse_args()

    with open(args.model) as m:
        model = cPickle.load(m)

    pm = model.pattern_model
    def dec(p):
        pattern = model.pattern_vocabulary[p]
        return '+'.join(model.morpheme_vocabulary[m] for m in pattern)
    patt_prob = ((pm.prob(p), p) for p in xrange(len(model.pattern_vocabulary)))
    for prob, p in heapq.nlargest(100, patt_prob):
        print(u'{0} {1}'.format(dec(p), prob).encode('utf8'))
    print('---------')

    
    for i, topic in enumerate(model.topic_word):
        print('Topic {0}'.format(i))
        stem_topic = topic.base.stem_model
        word_prob = ((stem_topic.prob(w), w) for w in xrange(len(model.stem_vocabulary)))
        for prob, w in heapq.nlargest(10, word_prob):
            print(u'{0} {1}'.format(model.stem_vocabulary[w], prob).encode('utf8'))
        print('---------')
Example #10
0
    def _locate_pinch_points(self, pinch_count: int) -> None:
        """Locate the pinch points.

        Args:
            pinch_count: Number of pinch points.
        """

        spendq = []
        svgq = []
        self.cashflow = collections.OrderedDict(
            sorted(self.cashflow.items(), key=lambda t: t[0]))
        last_k = None
        for k in self.cashflow.keys():
            spend, svg, days = self.cashflow[k]
            if last_k:  # Calculate running totals
                spend += self.cashflow[last_k][0]
                svg += self.cashflow[last_k][1]

            try:  # Calculate and save spending and saving deltas
                spend_d = spend / days
                svg_d = (spend + svg) / days
            except (decimal.InvalidOperation, decimal.DivisionByZero):
                spend_d = svg_d = 0
            self.cashflow[k] = [spend, svg, spend_d, svg_d]

            heapq.heappush(spendq, (spend_d, k))
            heapq.heappush(svgq, (svg_d, k))
            last_k = k

        self.pinch_points_spend = heapq.nlargest(pinch_count, spendq)
        self.pinch_points_spend = [item[1] for item in self.pinch_points_spend]
        self.pinch_points_svg = heapq.nlargest(pinch_count, svgq)
        self.pinch_points_svg = [item[1] for item in self.pinch_points_svg]
Example #11
0
 def getSkyline(self, buildings):
     """
     :type buildings: List[List[int]]
     :rtype: List[List[int]]
     """
     n = len(buildings)
     start = map(lambda x:x[0],buildings)
     end = map(lambda x:x[1], buildings)
     height = map(lambda x: x[2], buildings)
     s_t = zip(start, height, [0] * n)
     e_t = zip(end, height, [1] * n)
     total_t = s_t + e_t
     total_t = sorted(total_t, key=lambda x:(x[0],x[2]))
     h = [0]
     res = []
     heapq.heapify(h)
     for t in total_t:
         if t[2] == 0:
             prev = heapq.nlargest(1,h)[0] if h else 0
             heapq.heappush(h, t[1])
             if prev < t[1]:
                 res.append(t[:2])
         elif t[2] == 1:
             h.remove(t[1])
             heapq.heapify(h)
             top = heapq.nlargest(1,h) if heapq.nlargest(1,h) else [0]
             if top[0] < t[1]:
                 res.append([t[0]] + top)
     return res
         
    def run(self):
        logging.debug("====================\nInitializing Food Processor\n\n")
        start = datetime.now()

        try:
            logging.debug("====================\n Starting mapping\n\n")
            MapperTask().run()

            logging.debug("====================\n Starting reducing\n\n")
            ReducerTask().run()

            with open(os.path.join(settings.FILE_PATH, "category_reduce.txt"), "r") as reduce_file:
                content = simplejson.loads(reduce_file.read())

            # heapq.nlargest finds the top k items in O(n log k)
            top_categories = heapq.nlargest(5, content, key=lambda k: content[k])

            with open(os.path.join(settings.FILE_PATH, "food_reduce.txt"), "r") as reduce_file:
                content = simplejson.loads(reduce_file.read())

            top_foods = heapq.nlargest(100, content, key=lambda k: content[k])

            logging.info("\tTop Foods: {}".format(",".join(top_foods)))
            logging.info("\tTop categories: {}".format(",".join(top_categories)))

        except Exception, e:
            logging.exception("====================\nThere were some problems while processing food information\n")
    def addNum(self, num):
        """
        Adds a num into the data structure.
        :type num: int
        :rtype: void
        """
        
        smallValue = heapq.nlargest(1,self.small)
        largestValue = heapq.nsmallest(1,self.large)
        
        # Compare against the smallest element of the large heap (if it has one)
        if largestValue and num >= largestValue[0]:
            heapq.heappush(self.large,num)
            self.mark1 += 1
        else:
            heapq.heappush(self.small,num)
            self.mark2 += 1
            
        if self.mark1==2:
            temp = heapq.heappop(self.large)
            heapq.heappush(self.small,temp)
            self.mark1=0
        if self.mark2==2:
            temp = heapq.nlargest(1,self.small)[0]

            self.small.remove(temp)
            heapq.heapify(self.small)
            heapq.heappush(self.large,temp)
            self.mark2=0
Example #14
0
def proc_unigram_feats():
    mat,key,regy,_ = rs.extract_feats([rs.unigram_feats])
    inv_key = {v:k for k,v in key.items()}
    num_movies,num_words = mat.get_shape()

    movies = [(regy[i],i) for i in range(num_movies)]
    min_movies = heap.nsmallest(MOVIE_TARGET,movies)
    max_movies = heap.nlargest(MOVIE_TARGET,movies)
    tot_min = 0.
    tot_max = 0.
    for mv in min_movies:
        tot_min += mat[mv[1]].sum()
    for mv in max_movies:
        tot_max += mat[mv[1]].sum()
    fix = tot_max/tot_min
    diffs = np.zeros((num_words))
    for mv in min_movies:
        diffs += -1.*fix*mat[mv[1]]
    for mv in max_movies:
        diffs += mat[mv[1]]

    with open("english.stop") as f:
        stop_words = set([line.strip() for line in f.readlines()])
        words = [(diffs[0,i],inv_key[i]) for i in range(num_words)
                 if inv_key[i] not in stop_words]
        worst_words = heap.nsmallest(WORD_TARGET, words)
        worst_words.sort()
        best_words = heap.nlargest(WORD_TARGET, words)
        best_words.sort()

        for wd in worst_words:
            print wd[1] + '\t' + str(wd[0])
        print '---------------------------------'
        for wd in best_words:
            print wd[1] + '\t' + str(wd[0])
Example #15
0
    def get_median(self, n):
        if len(self.hmin) == len(self.hmax):
            if n <= self.median:
                heapq.heappush(self.hmin, n)
                self.median = heapq.nlargest(1, self.hmin)[0]
            else:
                assert (n > self.median)
                heapq.heappush(self.hmax, n)
                self.median = heapq.nsmallest(1, self.hmax)[0]

        elif len(self.hmin) < len(self.hmax):
            if n <= self.median:
                heapq.heappush(self.hmin, n)
                self.median = heapq.nlargest(1, self.hmin)[0]
            else:
                assert (n > self.median)
                item = heapq.heappop(self.hmax)
                heapq.heappush(self.hmin, item)
                heapq.heappush(self.hmax, n)
                self.median = heapq.nlargest(1, self.hmin)[0]

        else: 
            assert (len(self.hmin) > len(self.hmax))
            if n <= self.median:
                item = self._heappop_max(self.hmin)
                heapq.heappush(self.hmax, item)
                heapq.heappush(self.hmin, n)
                self.median = heapq.nlargest(1, self.hmin)[0]
            else:
                assert (n > self.median)
                heapq.heappush(self.hmax, n)
                self.median = heapq.nlargest(1, self.hmin)[0]
        return self.median
Example #16
0
def SeqLenDis(indexes):
    lengths = []
    for index in indexes:
        lengths.append(int(index[2]) - int(index[1]))
    print heapq.nlargest(30, lengths)
    plt.hist(lengths)
    plt.show()
    def accuracy(self, data, convert=False, items=1):
        """Return the number of inputs in ``data`` for which the neural
        network outputs the correct result. The neural network's
        output is assumed to be the index of whichever neuron in the
        final layer has the highest activation.  

        The flag ``convert`` should be set to False if the data set is
        validation or test data (the usual case), and to True if the
        data set is the training data. The need for this flag arises
        due to differences in the way the results ``y`` are
        represented in the different data sets.  In particular, it
        flags whether we need to convert between the different
        representations.  It may seem strange to use different
        representations for the different data sets.  Why not use the
        same representation for all three data sets?  It's done for
        efficiency reasons -- the program usually evaluates the cost
        on the training data and the accuracy on other data sets.
        These are different types of computations, and using different
        representations speeds things up.  More details on the
        representations can be found in
        mnist_loader.load_data_wrapper.

        """
        if convert:
            results = [(map(self.feedforward(x).tolist().index, heapq.nlargest(items, self.feedforward(x).tolist())), np.argmax(y)) for (x, y) in data]
        else:
            results = [(map(self.feedforward(x).tolist().index, heapq.nlargest(items, self.feedforward(x).tolist())), y)
                        for (x, y) in data]	
        counter = 0
        for (x, y) in results:
            if y == x:
                counter = counter + 1
        return counter
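The comprehension above recovers the indices of the top activations by calling list.index on each value, which always returns the first match when activations tie. A minimal alternative sketch (a hypothetical helper; `scores` is assumed to be a flat list of output activations) selects the indices directly with heapq:

import heapq

def top_k_indices(scores, items=1):
    # Indices of the `items` largest activations, without a separate value-to-index lookup.
    return heapq.nlargest(items, range(len(scores)), key=scores.__getitem__)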
def print_top(fn):
    dict = {}
    # parse words from file and store them into dict
    dict = utils.parse_words_from_file(fn)

    # extract top 20 most common words from dict and print
    print heapq.nlargest(20, dict, key=lambda x: dict[x])
Example #19
0
 def test_nlargest(self):
     data = [(random.randrange(2000), i) for i in range(1000)]
     for f in (None, lambda x:  x[0] * 547 % 2000):
         for n in (0, 1, 2, 10, 100, 400, 999, 1000, 1100):
             self.assertEqual(nlargest(n, data), sorted(data, reverse=True)[:n])
             self.assertEqual(nlargest(n, data, key=f),
                              sorted(data, key=f, reverse=True)[:n])
def heap_median_maintenance(read_in):
    starting_list = []
    median = []
    for i in read_in:
        starting_list.append(i)
        #If it's the first element being read in, that is the median
        if len(starting_list) == 1:
            low_heap = heapq.nsmallest(len(starting_list), starting_list)
            high_heap = heapq.nlargest(len(starting_list)-1, starting_list)
        #if even then split half way
        elif len(starting_list)%2 ==0:
            low_heap = heapq.nsmallest(len(starting_list)/2, starting_list)
            high_heap = heapq.nlargest(len(starting_list)/2, starting_list)
        #if odd give the larger portion to low heap
        else:
            low_list_amount = int(math.ceil(float(len(starting_list))/2))
            high_list_amount = int(len(starting_list) - math.ceil(float(len(starting_list))/2))
            low_heap = heapq.nsmallest(low_list_amount, starting_list)
            high_heap = heapq.nlargest(high_list_amount, starting_list)
            #print("Low heap has {} and high heap has {}".format(len(low_heap), len(high_heap)))
        #print("Low heap {}".format(low_heap))
        #print("high heap {}".format(high_heap))
        #print("Median is {}".format(heapq.nlargest(1, low_heap)[0]))
        #append median from the largest element of the low_heap
        median.append(heapq.nlargest(1, low_heap)[0])
    return median
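heap_median_maintenance above re-runs nsmallest and nlargest over the full list for every new element, which is O(n log n) per insertion. A minimal two-heap sketch of the same running-median idea (hypothetical names; the low half is stored negated so heapq's min-heap behaves like a max-heap) brings each insertion down to O(log n) while keeping the same convention of reporting the largest element of the low half:

import heapq

def running_medians(read_in):
    low_heap, high_heap = [], []  # low half (stored negated), high half
    medians = []
    for x in read_in:
        # Push into the low half, then move its largest element to the high half.
        heapq.heappush(low_heap, -x)
        heapq.heappush(high_heap, -heapq.heappop(low_heap))
        # Keep the low half at least as large as the high half.
        if len(low_heap) < len(high_heap):
            heapq.heappush(low_heap, -heapq.heappop(high_heap))
        medians.append(-low_heap[0])  # largest element of the low half
    return medians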
Example #21
0
	def mostcommon(self, n, which_kind):
		num = n
		print "--------------------------------------------------------------------"
		if which_kind == "continuous":
			if self.continuous == 0:
				print "No continuous connectives found!"
			else:
				print "Printing top " + str(n) + " continuous connectives:" #, which is " + str(num) + " items:"
			
				for key, freq in nlargest(num, self.continuous_dict.iteritems(), key=itemgetter(1)):
					print "\t", key.part_one[0], "occurs ", self.continuous_dict[key], " times, which is ", float(self.continuous_dict[key])/float(self.continuous)*100, " percent."
			
		elif which_kind == "discontinuous":
			if self.discontinuous == 0:
				print "No discontinuous connectives found!"
			else:
				print "Printing top " + str(n) + " discontinuous connectives:" #, which is " + str(num) + " items:"	
			
				for key, freq in nlargest(num, self.discontinuous_dict.iteritems(), key=itemgetter(1)):
					print "\t", key.part_one[0], " ... ", key.part_two[0], "occurs ", self.discontinuous_dict[key], " times, which is ", float(self.discontinuous_dict[key])/float(self.discontinuous)*100, " percent."
			
		elif which_kind == "ambiguous":
			if self.ambiguous() == 0:
				print "No ambiguous connectives found!"
			else:
				print "Printing top " + str(n) + " ambiguous connectives:" #, which is " + str(num) + " items:"	

				for key, freq in nlargest(num, self.ambiguous_dict.iteritems(), key=itemgetter(1)):
					print "\t", key.part_one[0], 
					if key.sep == "continuous":
						print " ... ", key.part_two[0],
					print "occurs ", self.ambiguous_dict[key], " times, which is ", float(self.ambiguous_dict[key])/float(self.ambiguous())*100, " percent."

		return
Example #22
0
def show_top(model):
    top_prefixes = heapq.nlargest(10, izip(model.base.theta_p.counts, model.prefix_vocabulary))
    n_prefixes = sum(1 for c in model.base.theta_p.counts if c > 0)
    logging.info('Top prefixes (10/%d): %s', n_prefixes, ' '.join(prefix+':'+str(c) for c, prefix in top_prefixes))
    top_suffixes = heapq.nlargest(10, izip(model.base.theta_s.counts, model.suffix_vocabulary))
    n_suffixes = sum(1 for c in model.base.theta_s.counts if c > 0)
    logging.info('Top suffixes (10/%d): %s', n_suffixes, ' '.join(suffix+':'+str(c) for c, suffix in top_suffixes))
Example #23
0
    def DumpAudioDiagnostics(self, dir_name="./data/", top_k=10, bot_k=10):
        # utterance level diag
        import heapq

        utt_largest = heapq.nlargest(top_k, self.utt_feature, key=self.utt_feature.get)
        i = 0
        for utt in utt_largest:
            utt_id = string.join(utt.split("_")[0:-2], "_")
            t_beg = float(utt.split("_")[-2]) / self.samp_period
            t_end = float(utt.split("_")[-1]) / self.samp_period
            file_id = self.list_files[self.map_utt_idx[utt_id]]
            out_file = "./data/" + repr(i) + "large_srate_" + os.path.basename(file_id).split(".")[0] + ".wav"
            util.cmdconvert(file_id, out_file, t_beg, t_end)
            i += 1
        utt_smallest = heapq.nsmallest(bot_k, self.utt_feature, key=self.utt_feature.get)
        i = 0
        for utt in utt_smallest:
            utt_id = string.join(utt.split("_")[0:-2], "_")
            t_beg = float(utt.split("_")[-2]) / self.samp_period
            t_end = float(utt.split("_")[-1]) / self.samp_period
            file_id = self.list_files[self.map_utt_idx[utt_id]]
            out_file = "./data/" + repr(i) + "small_srate_" + os.path.basename(file_id).split(".")[0] + ".wav"
            util.cmdconvert(file_id, out_file, t_beg, t_end)
            i += 1
        # glob level diag
        glob_largest = heapq.nlargest(top_k, self.glob_feature, key=self.glob_feature.get)
        for utt_id in glob_largest:
            file_id = self.list_files[self.map_utt_idx[utt_id]]
            out_file = "./data/glob_large_srate_" + os.path.basename(file_id).split(".")[0] + ".wav"
            util.cmdconvert(file_id, out_file)
        glob_smallest = heapq.nsmallest(top_k, self.glob_feature, key=self.glob_feature.get)
        for utt_id in glob_smallest:
            file_id = self.list_files[self.map_utt_idx[utt_id]]
            out_file = "./data/glob_small_srate_" + os.path.basename(file_id).split(".")[0] + ".wav"
            util.cmdconvert(file_id, out_file)
Example #24
0
def get_top_k_users(user, all_users, k, similarity_metric, is_user_generated=False):
    similarity_funct = user.get_similarity_funct(similarity_metric)
    # Discount the first user since it is the same as the one being
    # compared against
    if not is_user_generated:
        return heapq.nlargest(k+1, all_users, similarity_funct)[1:]
    else:
        return heapq.nlargest(k, all_users, similarity_funct)
def getmedian(l, r):
    # print('Median:', l, '-', rightq)
    if len(l) > len(r):
        return heapq.nlargest(1, l)[0]
    elif len(l) == len(r):
        return (heapq.nlargest(1, l)[0] + heapq.nsmallest(1, r)[0]) / 2
    else:
        return heapq.nsmallest(1, r)[0]
Example #26
0
def test():
    list_test=list('abcde')
    heapq.heapify(list_test)
    logger.error(heapq.nlargest(3, list_test))
    list_b = list('bcdef')
    heapq.heapify(list_b)
    list_res = [i for i in heapq.merge(list_test, list_b)]
    logger.error(heapq.nlargest(3, list_res))
def getClosestCases(list_of_label_files, list_of_similarity_xml_files, \
    similarity, num_closest_cases):
    """Get the closest cases to the test case using some similarity metric and
       given a list of files that contain the similarity value. 

    Parameters
    ----------
    list_of_similarity_xml_files : list of file names (string)
        Each file name points to an xml file that contains the similarity value

    list_of_label_files : list of file names (string)
        Each file name points to a file that has a labeled image
        
    similarity : string
        Type of similarity used
        
    num_closest_cases : int
        The number of closest cases to return
        ...
        
    Returns
    -------
    closest_cases : string 2D list with shape (2, num_cases)
         Contains the num_cases cases with highest similarity to the case 
         being tested, and the associated similarity value
        ...
    """

    #TODO: Account for different similarities
    
    num_training_cases = len(list_of_label_files)
    
    #Read the similarity values 
    similarity_values = getMISimilarityVec(list_of_similarity_xml_files)
    print(similarity_values)

    #find highest matches    
    indexes=[]
    for i in range(num_training_cases):
        indexes.append(i)
    if (similarity == "NCC"):
        nlargestvalues = heapq.nlargest(num_closest_cases, indexes, key=lambda \
            i: abs(float(similarity_values[i]))) #take the top num_closest_cases (assuming the test case is not in the training set)
    else:
        nlargestvalues = heapq.nlargest(num_closest_cases, indexes, key=lambda \
            i: (similarity_values[i])) #take the top num_closest_cases (assuming the test case is not in the training set)
    print(nlargestvalues)
    patient_atlas_labelmaps = [""]*num_closest_cases
    patient_atlas_similarity = [1.0]*num_closest_cases
    
    #Now store the num_cases in a 2D list
    for i in range(num_closest_cases): 
        patient_atlas_labelmaps[i] = list_of_label_files[nlargestvalues[i]]
        print(patient_atlas_labelmaps[i])
        patient_atlas_similarity[i] = similarity_values[nlargestvalues[i]]
    
    closest_cases = np.vstack((patient_atlas_labelmaps, patient_atlas_similarity))
    return closest_cases
Example #28
0
 def _map(self, key, item):
     statistics = item[self.dname]
     total = item[self.tname]
     
     if total == 0.0:
         return None
     
     days = 0
     weeks = 0
     months = 0
     
     data_day = {}
     data_week = {}
     data_month = {}
     
     sum_week = 0.0
     sum_month = 0.0
     
     for i in xrange(len(statistics)):
         stat = statistics[i]
         days = i
         
         sum_week += stat
         sum_month += stat
         
         data_day[days] = stat
         if days % 7 == 0:
             data_week[weeks] = sum_week
             weeks += 1
             sum_week = 0.0
         if days % 30 == 0:
             data_month[months] = sum_month
             months += 1
             sum_month = 0.0
     
     daily_top = nlargest(3, data_day.values())
     weekly_top = nlargest(3, data_week.values())
     monthly_top = nlargest(3, data_month.values())
     
     daily_frac = None
     if len(daily_top) >= 3:
         daily_frac = (daily_top[0] / total, 
                       daily_top[1] / total, 
                       daily_top[2] / total)
     
     weekly_frac = None    
     if len(weekly_top) >= 3:
         weekly_frac = (weekly_top[0] / total, 
                        weekly_top[1] / total,
                        weekly_top[2] / total)
     
     monthly_frac = None    
     if len(monthly_top) >= 3:
         monthly_frac = (monthly_top[0] / total, 
                         monthly_top[1] / total, 
                         monthly_top[2] / total)
     
     return (daily_frac, weekly_frac, monthly_frac)
Example #29
0
 def clientLatestSnapshot(self, onlyTs):
     clientTsList = self.subvolSplitTsList(self.clientSubvolList(withUUID=False))
     if onlyTs is True:
         return heapq.nlargest(1, clientTsList)[0]
     else:
         for subvol in self.clientSubvolList(withUUID=False):
             if heapq.nlargest(1, clientTsList)[0] in subvol:
                 logInfo("isLockfile(): Newest client subvolume is %s" % subvol)
                 return subvol
Example #30
0
 def returnNLargest(self, n):
     # return the N largest values in the data stored in the heap.
     # if n is less than k, return n largest data
     # if not, return the data of size k
     if(n <= self.__k):
         return heapq.nlargest(n, self.__data)
     else:
         print "n is greater than the size of data"
         return heapq.nlargest(self.__k, self.__data)
Example #31
0
    def choose_playlist(self):
        if not self.candidate_playlists:
            return None

        print('genre_weights')

        # Note that, at this point, we only have genre information of five tracks per playlist (this
        # is the information that SoundCloud usually returns for playlist requests).
        tracks_genre_distr = list(self.get_free_tracks_genre_distr().items())
        tracks_genre_distr.sort(key=lambda item: item[1])
        self.genre_weights = {genre: 0.85**(rank+1)
                                     if genre not in IGNORE_GENRES and \
                                        genre != 'others' and \
                                        genre != 'unknown'
                                     else 0.0
                              for rank, (genre, _) in enumerate(tracks_genre_distr)}

        print('candidates')

        # The below scoring code is pretty slow, and the number of candidate playlists grows over
        # time. We can try to speed it up by sampling a random subset of candidates in each step.
        candidates = random.sample(list(self.candidate_playlists.items()),
                                   k=20000)

        print('scores')

        weights = []
        for _, candidate in candidates:
            track_values = []
            new_tracks = 0

            for track_info in candidate['tracks']:
                if track_info['id'] not in self.tracks:
                    new_tracks += 1

                if self.is_complete_track_info(track_info):
                    mapped_genre = map_genre(track_info['genre'])
                    if mapped_genre == 'ignore':
                        track_value = 0.0
                    else:
                        track_value = 1.0
                        track_value *= 1.0 if self.is_free(track_info['license']) else 0.00005
                        track_value *= 1.0 if self.is_track_okay(track_info) else 0.01
                        track_value *= 1.0 if track_info['id'] not in self.tracks else 0.01
                        track_value *= self.genre_weights.get(mapped_genre, 0.0)

                    track_values.append(track_value)

            if len(track_values) == 0:
                weights.append(0.0)
            else:
                size_mult = 2/(1+math.exp(-new_tracks/20))-1
                new_ratio = new_tracks / len(candidate['tracks'])
                mean_score = new_ratio * np.mean(track_values)
                score = size_mult * mean_score
                weights.append(math.exp(10000.0 * score))

        print('topk')

        candidates_weights = zip(candidates, weights)
        candidates_weights = heapq.nlargest(50,
                                            candidates_weights,
                                            key=lambda pair: pair[1])
        weights = [pair[1] for pair in candidates_weights]

        #for item, weight in candidates_weights[-50:]:
        #    print(f'    {weight}\t'
        #          f'{item[1]["genre_distr"]}\t'
        #          f'{item[1]["freeness"]}\t'
        #          f'{self.calc_genre_novelty(item[1]["genre_distr"])}')

        print('sample')

        choice_item, choice_weight = random.choices(candidates_weights, weights=weights)[0]

        print(f'    playlist_id: {choice_item[0]}, '
              f'weight: {choice_weight}')

        return choice_item[0]
Example #32
0
# Get the N largest / smallest elements
import heapq
nums = [1, 8, 2, 23, 7, -4, 18, 23, 42, 37, 2]
print(heapq.nlargest(3, nums))  # [42, 37, 23]  # the three largest values
print(heapq.nsmallest(1, nums))  # [-4]  # the smallest value
Example #33
0
#!coding=utf-8

"""
    查找最大或最小的N个元素
"""

# 从一个集合中获得最大或者最小的N个元素列表
import heapq

nums = [1, 8, 2, 23, 7, -4, 18, 23, 42, 37, 2]
print heapq.nlargest(3, nums)  # 维护一个最小堆
print heapq.nsmallest(3, nums)  # 维护一个最大堆

portfolio = [
    {'name': 'IBM', 'shares': 100, 'price': 91.1},
    {'name': 'AAPL', 'shares': 50, 'price': 543.22},
    {'name': 'FB', 'shares': 200, 'price': 21.09},
    {'name': 'HPQ', 'shares': 35, 'price': 31.75},
    {'name': 'YHOO', 'shares': 45, 'price': 16.35},
    {'name': 'ACME', 'shares': 75, 'price': 115.65}
]

cheap = heapq.nsmallest(3, portfolio, key=lambda s: s['price'])
expensive = heapq.nlargest(3, portfolio, key=lambda  s: s['price'])
print cheap
print expensive

# If you want the N smallest or largest elements of a collection and N is small
# compared to the number of elements, heapq performs very well: the underlying
# implementation is a heap, and each heap operation costs O(log n).
heap = list(nums)
heapq.heapify(heap)  # turn the list into a heap (a min-heap)
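A short follow-up sketch using the `heap` built just above: heappop always removes and returns the current smallest element, so repeated pops walk the data in ascending order.

print heapq.heappop(heap)  # -4
print heapq.heappop(heap)  # 1
print heapq.heappop(heap)  # 2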
Example #34
0
    print('Making Dev preds')
    json_preds = {}
    json_preds['questions'] = []
    num_docs = 0
    for i in range(len(data['queries'])):
        num_docs += 1
        dy.renew_cg()

        qtext = data['queries'][i]['query_text']
        qwds, qvecs, qconv = model.MakeInputs(qtext)

        rel_scores = {}
        for j in range(len(data['queries'][i]['retrieved_documents'])):
            doc_id = data['queries'][i]['retrieved_documents'][j]['doc_id']
            dtext = (docs[doc_id]['title'] + ' <title> ' +
                     docs[doc_id]['abstractText'])
            dwds, dvecs, dconv = model.MakeInputs(dtext)
            bm25 = data['queries'][i]['retrieved_documents'][j][
                'norm_bm25_score']
            efeats = model.GetExtraFeatures(qtext, dtext, bm25)
            efeats_vec = dy.inputVector(efeats)
            score = model.GetQDScore(qwds, qconv, dwds, dconv, efeats_vec)
            rel_scores[j] = score.value()

        top = heapq.nlargest(10, rel_scores, key=rel_scores.get)
        utils.JsonPredsAppend(json_preds, data, i, top)
        dy.renew_cg()

    utils.DumpJson(json_preds, 'abel_dev_preds_ep' + str(epoch) + '.json')
    print('Done')
                   loss="categorical_crossentropy",
                   metrics=["categorical_accuracy"])
     model.load_weights(
         '/data/scene/scene_classification_res_gpu/weights/scene_InRes.01-1.33.h5'
     )
 if loadmode == 'model':
     model = load_model(
         '/data/scene/scene_classification_res_1dense/weights/scene_InRes_Dropout0.00_Lr1.00e-05_Densen1_01-2.87.h5'
     )
 for index in range(len(val_image_paths)):
     imgpath = val_image_paths[index]
     ture_score = ture_val_scores[index]
     img = preprocess(imgpath, image_size)
     scores = model.predict(np.array([img]))[0]
     top_number = 1
     top_key = heapq.nlargest(top_number, range(len(scores)),
                              scores.take)
     print('---top_key', top_key)
     top_value = heapq.nlargest(top_number, scores)
     print('---top_value', top_value)
     if ture_score in top_key:
         accuracy += 1
     else:
         basename = os.path.basename(imgpath)
         basename = basename.split('.')[0].split(
             '_')[0] + '_pre_' + str(top_key[0]) + '_f_' + str(
                 top_value[0]) + '.jpg'
         outpath = os.path.join(outdir, basename)
         shutil.copy(imgpath, outpath)
     imgrecord.append(imgpath)
     scorerecord.append(scores)
 accuracys.append(accuracy / len(val_image_paths))
Example #36
0
def prediction():
    """
    prediction interface
    :return:
    """
    print("prediction")
    log_path = os.path.join(".", config['log_path'])
    logger = get_logger(log_path)
    map_file_path = "./pkl/" + domain + "/data.pkl"
    if os.path.isfile(map_file_path):
        with open(map_file_path, "rb") as f:
            id2mid, id2p, entity_entity_sim_Matrix, entity_relation_Adj, truth_label, \
            train_entity_list, test_entity_list, valid_entity_list, entity_size, relation_size = pickle.load(f)
            config['entity_size'] = entity_size
            config['relation_size'] = relation_size
    else:
        id2mid, id2p, entity_entity_sim_Matrix, entity_relation_Adj, truth_label, train_entity_list, \
        test_entity_list, valid_entity_list = data_reader(logger, config=config, domain=domain,
                                                          entity_entity_topk=entity_knn_number)
        with open(map_file_path, "wb") as f:
            pickle.dump([
                id2mid, id2p, entity_entity_sim_Matrix, entity_relation_Adj,
                truth_label, train_entity_list, test_entity_list,
                valid_entity_list, config['entity_size'],
                config['relation_size']
            ], f)
    test_data = (entity_relation_Adj, entity_entity_sim_Matrix,
                 test_entity_list, truth_label)
    test_manager = BatchManager(test_data, config['batch_size'], "test")

    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    config['decay_steps'] = 100
    config['ckpt_path'] = "result/" + datasettype + "/" + domain + "/"
    rec_results = {}
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, config['ckpt_path'], config, logger)
        for batch in test_manager.iter_batch():  # each batch is an entity
            entity_relation_Adj, entity_entity_sim_Matrix, input_entity, input_relation, label = batch
            relation_z = model.run_step(sess, False, batch)
            relation_z = relation_z[0]

            reserved_prop_list = []
            for p in range(len(entity_relation_Adj[input_entity[0]])):
                if entity_relation_Adj[input_entity[0]][p] >= 1:
                    reserved_prop_list.append(p)

            entityid = input_entity[0]
            for index in range(len(input_entity)):  # each batch is an entity
                # entityid = input_entity[index]
                relationid = input_relation[index]
                if entity_relation_Adj[entityid][
                        relationid] >= 1:  # delete pre-reserved properties
                    relation_z[index] = -2.0

            map_property_score = {
                key: value
                for key, value in enumerate(relation_z)
            }
            exps_list = heapq.nlargest(k,
                                       map_property_score,
                                       key=map_property_score.get)

            rec_results[id2mid[entityid]] = [id2p[pid] for pid in exps_list]
    return rec_results
Example #37
0
def topmost(img, top, title=''):
    n_labels = set(img.ravel())
    print(title, heapq.nlargest(top, n_labels))
    return int(np.max(list(n_labels))), int(np.mean(list(n_labels)))
 def closest_to_vec(self, vec, n=10):
     # scores = ((self.m.dot(vec) + 1.0) / 2)
     scores = self.m.dot(vec)
     return heapq.nlargest(n, zip(scores, self.iw))
Example #39
0
def evaluateSimilarityJSON(screen_name,new_tweet):

	sentiClassA = cs.sentiClass(new_tweet)

	rake_object = RAKE.Rake(stoplist_file)
	wiki.set_lang("en")

	keyword = rake_object.run(new_tweet)
	# print keyword
	# print wiki.summary(str(keyword[0][0])).encode("utf-8")

	# try :
	# 	summ_wiki = wiki.summary(str(keyword[0][0])).encode("utf-8")
	# except (wiki.exceptions.PageError,wiki.exceptions.DisambiguationError) as e:
	# 	try:
	# 		if keyword[1][1] and keyword[1][1] > 1:
	# 			summ_wiki = wiki.summary(str(keyword[1][0])).encode("utf-8")
	# 	except (wiki.exceptions.PageError,wiki.exceptions.DisambiguationError) as e:
	# 		summ_wiki = str(keyword[1][0])
	# 	aug_tweet = new_tweet.encode('utf-8') + " " + summ_wiki
	# else :
	# 	aug_tweet = new_tweet.encode('utf-8') + " " + summ_wiki

	# print '\nevaluateSimilarity for tweet:', aug_tweet

	# preprocessed_msg = cs.slangCleanser(aug_tweet)
	preprocessed_msg = cs.slangCleanser(new_tweet)
	preprocessed_msg = cs.lemmatizingText(preprocessed_msg)
	ldaScoreA = cs.countDocLDA(preprocessed_msg)

	with open(str(DIR)+'/'+str(screen_name)+'_datasets/'+screen_name+'_followers.txt') as f:
		followers = f.read().splitlines()

	tweet_counter = 0
	recommendation_list = collections.defaultdict(list)
	recommendation_list_np = collections.defaultdict(list)
	recommendation_list_recent = collections.defaultdict(list)
	recommendation_list_rt = collections.defaultdict(list)
	recommendation_list_hashtag = collections.defaultdict(list)
	recommendation_list_senti = collections.defaultdict(list)
	recommendation_list_prof = collections.defaultdict(list)

	recommendation_list_c = collections.defaultdict(list)

	recommendation_list_fnr = collections.defaultdict(list)
	recommendation_list_csi = collections.defaultdict(list)
	recommendation_list_cb = collections.defaultdict(list)
	recommendation_list_eb = collections.defaultdict(list)

	for user in followers:
		if os.path.exists(str(DIR)+'/'+str(screen_name)+'_datasets/'+str(user)+'/'+ str(user) +'.txt'):
			with open(str(DIR)+'/'+str(screen_name)+'_datasets/'+str(user)+'/'+ str(user) +'.txt') as f:
				all_tweets = f.read().splitlines()

		if len(all_tweets) < 2:
			print "pass", user
			continue

		print 'estimating for user: ', user
		if os.path.exists(str(DIR)+'/'+str(screen_name)+'_datasets/'+ str(user) +'/'+ str(user) +'_UserProfilingLDA.json'):

			with open(str(DIR)+'/'+str(screen_name)+'_datasets/'+ str(user) +'/'+ str(user) +'_UserProfilingLDA.json') as json_data:
				d = json.load(json_data)

				#User profile description relevance. capture user profile desciption start here:
				if d['profile'] != "":
					profileInterest = 1-cs.JSD_dist(d['profile'],ldaScoreA)
				else:
					profileInterest = 0

				#count Recent Hashtag tweets Score start here:
				if d['hashtag'] != "":
					hashtagsInterest = 1-cs.JSD_dist(d['hashtag'],ldaScoreA) 
				else:
					hashtagsInterest = 0


				#count Recent Tweets Score start here:
				if d['recent'] != "":
					recentInterest = 1-cs.JSD_dist(d['recent'],ldaScoreA) 

					if os.path.exists(str(DIR)+'/'+str(screen_name)+'_datasets/'+str(user)+'/'+ str(user) +'.txt'):
						#sentiment score features
						"Open file again for sentiment based"
						with open(str(DIR)+'/'+str(screen_name)+'_datasets/'+str(user)+'/'+ str(user) +'.txt') as f:

							data_tweets = f.read().splitlines()
							sentiScore = cs.sentiInterest(data_tweets,ldaScoreA,sentiClassA)
							sentiInterest = cs.sentiInterest(data_tweets,ldaScoreA,sentiClassA)

						"Open file for EAR cosine_sim"
						with open(str(DIR)+'/'+str(screen_name)+'_datasets/'+str(user)+'/'+ str(user) +'.txt') as f:

							aggregate_tweets = f.read().replace('\n', ' ')
							cosine_sim = cs.cosine_sim(new_tweet,aggregate_tweets)
				else:
					recentInterest = 0
					continue


				#count recent Retweet Score start here:
				if d['sharing'] != "":

					sharingInt = 1-cs.JSD_dist(d['sharing'],ldaScoreA) 
					num_rt = int(d['num_rt'])

				else:
					sharingInt = 0
					num_rt = 0


				#count number of follower Score start here:

				num_followers = int(d['num_followers'])
				empirical_based = num_followers*(cosine_sim*int(num_rt))


				CSI = (0.4*(recentInterest)) + (0.3*(sharingInt)) + (0.3*(hashtagsInterest))
				Topic_based = (0.25*(recentInterest)) + (0.2*(sharingInt)) + (0.2*(hashtagsInterest) + (0.15*(profileInterest)))

				#If the new tweet's polarity is neutral, don't consider the sentiment feature
				if sentiClassA == "NEUTRAL":
					sentiScore = 0
				SeAT = Topic_based + (0.25*sentiScore)
				SeAT_NP = CSI + sentiScore
				# print 'SeAT: ', SeAT,'; recentInterest: ', recentInterest,'; sharingInt: ', sharingInt,'; hashtagsInterest: ', hashtagsInterest,'; profileInterest: ', profileInterest,'; sentiInterest: ', sentiInterest
				print "--------\n"
				recommendation_list[user].append(SeAT)
				recommendation_list_np[user].append(SeAT_NP)
				recommendation_list_recent[user].append(recentInterest)
				recommendation_list_rt[user].append(sharingInt)
				recommendation_list_hashtag[user].append(hashtagsInterest)
				recommendation_list_senti[user].append(sentiInterest)
				recommendation_list_prof[user].append(profileInterest)

				recommendation_list_csi[user].append(CSI)
				recommendation_list_fnr[user].append(num_followers)
				recommendation_list_eb[user].append(empirical_based)


	# Proposed System
	top_15_SeAT = heapq.nlargest(15, recommendation_list, key=recommendation_list.get)
	top_15_SeAT_NP = heapq.nlargest(15, recommendation_list_np, key=recommendation_list_np.get)
	top_15_rt = heapq.nlargest(15, recommendation_list_rt, key=recommendation_list_rt.get)
	top_15_hashtag = heapq.nlargest(15, recommendation_list_hashtag, key=recommendation_list_hashtag.get)
	top_15_senti = heapq.nlargest(15, recommendation_list_senti, key=recommendation_list_senti.get)
	top_15_prof = heapq.nlargest(15, recommendation_list_prof, key=recommendation_list_prof.get)
	top_15_recent = heapq.nlargest(15, recommendation_list_recent, key=recommendation_list_recent.get)

	top_15_csi = heapq.nlargest(15, recommendation_list_csi, key=recommendation_list_csi.get)
	top_15_fnr = heapq.nlargest(15, recommendation_list_fnr, key=recommendation_list_fnr.get)
	top_15_eb = heapq.nlargest(15, recommendation_list_eb, key=recommendation_list_eb.get)

	with open(str(RESULT_PATH)+'/result_'+str(screen_name)+'.csv', 'a') as csvfile:
	    evaluation = csv.writer(csvfile,quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
	    evaluation.writerow([new_tweet,top_15_SeAT,top_15_SeAT_NP,top_15_csi,top_15_eb,top_15_fnr,top_15_recent,top_15_rt,top_15_hashtag,top_15_senti,top_15_prof])
    def find_median(self):
        if len(self.max_heap) == len(self.min_heap):
            return (self.max_heap[0] + self.min_heap[0]) / 2
        else:
            return self.max_heap[0]


if __name__ == '__main__':
    mf = MedianFinder()
    mf.add_num(5)
    mf.add_num(4)
    mf.add_num(9)
    mf.add_num(7)
    mf.add_num(3)
    print(mf.min_heap)
    print(mf.max_heap)
    print(mf.find_median())

    import heapq

    demo = [5, 1, 8, 3]
    # heapify converts the list into a heap (a min-heap) in place
    heapq.heapify(demo)
    # heappush inserts an element while preserving the heap invariant
    heapq.heappush(demo, 2)
    # nlargest returns the n largest items of an iterable; a comparison key can be given
    print(heapq.nlargest(3, demo))
    # nsmallest returns the n smallest items of an iterable; a comparison key can be given
    print(heapq.nsmallest(3, demo))
    # heappop removes and returns the smallest element of the heap
    print(heapq.heappop(demo))

Example #41
0
def evaluate(sess, model, name, data, logger, id2mid, id2p):
    logger.info("evaluate data:{}".format(name))
    all_precision = 0
    all_ndcg = 0
    test_entity_cnt = 0
    all_map = 0
    all_ndcg_topall = 0

    for batch in data.iter_batch():
        entity_relation_Adj, entity_entity_sim_Matrix, input_entity, input_relation, label = batch
        relation_z = model.run_step(sess, False, batch)  # batch_size*1
        relation_z = relation_z[0]

        test_entity_cnt += 1  # each batch is an entity
        reserved_prop_list = []
        for p in range(len(entity_relation_Adj[input_entity[0]])):
            if entity_relation_Adj[input_entity[0]][p] >= 1:
                reserved_prop_list.append(p)

        for index in range(len(input_entity)):
            entityid = input_entity[index]
            relationid = input_relation[index]
            if entity_relation_Adj[entityid][relationid] >= 1:
                relation_z[index] = -2.0  # exclude pre-reserved properties

        truth = label
        truth_set = set()
        for a in range(len(truth)):
            if truth[a] == 1 and a not in reserved_prop_list:  # exclude pre-reserved properties
                truth_set.add(a)

        map_property_score = {
            key: value
            for key, value in enumerate(relation_z)
        }
        exps_list = heapq.nlargest(k,
                                   map_property_score,
                                   key=map_property_score.get)
        precision = precision_at_k(exps_list, truth_set, k)
        ndcg = ndcg_at_k(exps_list, truth_set, k)

        ranklist_topall = sorted(map_property_score.items(),
                                 key=lambda d: d[1],
                                 reverse=True)
        ranklist_topall = [key for key, value in ranklist_topall]
        map_topall = ap(ranklist_topall, truth_set)
        ndcg_topall = ndcg_at_k(ranklist_topall, truth_set, 10000)

        all_precision += precision
        all_ndcg += ndcg
        all_map += map_topall
        all_ndcg_topall += ndcg_topall

    all_precision = all_precision / test_entity_cnt
    all_ndcg = all_ndcg / test_entity_cnt
    all_map = all_map / test_entity_cnt
    all_ndcg_topall = all_ndcg_topall / test_entity_cnt

    if name == "valid":
        best_dev_precision = model.best_dev_precision.eval()
        best_dev_ndcg = model.best_dev_ndcg.eval()
        if all_precision >= best_dev_precision and all_ndcg >= best_dev_ndcg:
            tf.assign(model.best_dev_precision, all_precision).eval()
            tf.assign(model.best_dev_ndcg, all_ndcg).eval()
            logger.info("new best dev precision at {} :{:>.5f}".format(
                k, all_precision))
            logger.info("new best dev ndcg at {} :{:>.5f}".format(k, all_ndcg))
        return all_precision > best_dev_precision and all_ndcg > best_dev_ndcg

    elif name == "test":
        best_test_precision = model.best_test_precision.eval()
        best_test_ndcg = model.best_test_ndcg.eval()
        if all_precision >= best_test_precision and all_ndcg >= best_test_ndcg:
            tf.assign(model.best_test_precision, all_precision).eval()
            tf.assign(model.best_test_ndcg, all_ndcg).eval()
            logger.info("new best test precision at {} :{:>.5f}".format(
                k, all_precision))
            logger.info("new best test ndcg at {} :{:>.5f}".format(
                k, all_ndcg))

        return all_precision, all_ndcg, all_map, all_ndcg_topall
Example #42
0
        if (val_frecuencia == None):
            val_frecuencia = 0
        else:
            puntaje_final += val_frecuencia
    return puntaje_final


cadena_hex = "1b37373331363f78151b7f2b783431333d78397828372d363c78373e783a393b3736"
#cadena = cadena_hex.decode("hex")
cadena = bytes.fromhex(cadena_hex).decode('utf-8')
spell = SpellChecker()
res = ''
puntaje_final = 0
puntaje_final = []
for i in range(256):
    for j in cadena:
        res_byte = ord(j) ^ i
        res += chr(res_byte)
    puntaje_actual = analiza_frecuencia(res)
    puntaje_final.append((puntaje_actual, res, i))
    res = ''
ult = heapq.nlargest(3, puntaje_final)

for i in range(3):
    #print(ult[i][1])
    palabras = ult[i][1].split(' ')
    mejor_palabra = spell.known(palabras)
    if (mejor_palabra):
        print(ult[i][1])
        break
import heapq as heap

L = []
heap.heappush(L, 20)
heap.heappush(L, 14)
heap.heappush(L, 5)
heap.heappush(L, 15)
heap.heappush(L, 10)
heap.heappush(L, 2)

print(L)
print(heap.heappop(L))
print(L)
print(heap.heappushpop(L, 18))
print(L)

L1 = heap.nlargest(3, L)
print(L1)
L2 = heap.nsmallest(3, L)
print(L2)

L3 = [20, 14, 2, 15, 10, 21]
print(L3)
heap.heapify(L3)
print(L3)
Example #44
0
def preprocesser(vals,
                 refine=2,
                 Rm_Outliers=False,
                 Filter=True,
                 Median=False,
                 Mean=True):

    # Determine spatial resolution
    resolution = vals.shape[0]

    if Rm_Outliers:
        # Identify and remove outliers
        outlier_buffer = 5

        vals_list = vals.reshape((resolution * resolution, ))
        vals_mins = heapq.nsmallest(outlier_buffer, vals_list)
        vals_maxes = heapq.nlargest(outlier_buffer, vals_list)

        # Cap max and min
        vals_min = np.max(vals_mins)
        vals_max = np.min(vals_maxes)

        # Trim outliers
        over = (vals > vals_max)
        under = (vals < vals_min)

        # Remove outliers
        vals[over] = vals_max
        vals[under] = vals_min

    else:
        vals_min = np.max(vals)
        vals_max = np.min(vals)

    if Filter:
        # Apply median/mean filter
        if Median:
            vals = median_filter(vals)
        if Mean:
            vals = mean_filter(vals)

    # Create grid
    start = 0.0
    end = 1.0
    x = np.linspace(start, end, resolution)
    y = np.linspace(start, end, resolution)

    [X, Y] = np.meshgrid(x, y)

    interp_vals = interp2d(x, y, vals, kind='cubic')

    # Create refined grid
    plot_start = 0.0
    plot_end = 1.0

    plot_x = np.linspace(plot_start, plot_end, refine * resolution)
    plot_y = np.linspace(plot_start, plot_end, refine * resolution)

    [plot_X, plot_Y] = np.meshgrid(plot_x, plot_y)

    vals_int_values = interp_vals(plot_x, plot_y)

    return vals_int_values, plot_X, plot_Y
Example #45
0
def extract(query, choices, *, scorer=quick_ratio, score_cutoff=0, limit=10):
    it = _extraction_generator(query, choices, scorer, score_cutoff)
    key = lambda t: t[1]
    if limit is not None:
        return heapq.nlargest(limit, it, key=key)
    return sorted(it, key=key, reverse=True)
        pool_subset = 2000
        pool_subset_dropout = np.asarray(
            random.sample(range(0, X_Pool.shape[0]), pool_subset))
        X_Pool_Dropout = X_Pool[pool_subset_dropout, :, :, :]
        y_Pool_Dropout = y_Pool[pool_subset_dropout]

        #compute the class predicted probabilities for the pool points
        predicted_probabilities = model.predict(X_Pool_Dropout,
                                                batch_size=batch_size,
                                                verbose=1)

        BvSB = 0
        #compute BvSB for each pool point
        for d in range(predicted_probabilities.shape[0]):
            D = predicted_probabilities[d, :]
            Z = heapq.nlargest(2, D)
            v = np.absolute(np.diff(Z))
            BvSB = np.append(BvSB, v)

        # Finding the minimum "Queries" probability difference values
        BvSB = BvSB[1:]
        a_1d = BvSB.flatten()
        x_pool_index = a_1d.argsort()[-Queries:]

        # find maximum probability difference values
        # BvSB = BvSB[1:]
        # a_1d = BvSB.flatten()
        # x_pool_index = a_1d.argsort()[-Queries:][::-1]

        # THIS FINDS THE MINIMUM INDEX
        # a_1d = U_X.flatten()
Example #47
0
 def top(self, n):
     """Return (count, obs) tuples for the n most frequent observations."""
     return heapq.nlargest(n, [(v, k) for (k, v) in self.dictionary.items()])
Example #48
0
 def find_similar(self, t, n=5):
     e = self.get_embedding(t)
     sim = CosineSimilarity()
     similarities = [sim(e, se).item() for se in self.embeddings]
     top = heapq.nlargest(n, enumerate(similarities), key=lambda p: p[1])
     return top
Example #49
0
import heapq

list = [23, 2, 6, 0, 14, 7]

stocks = [{
    'ticker': 'AAPL',
    'price': 201
}, {
    'ticker': 'GOOG',
    'price': 800
}, {
    'ticker': 'FB',
    'price': 54
}, {
    'ticker': 'MSFT',
    'price': 333
}, {
    'ticker': 'TUNA',
    'price': 789
}]

print(heapq.nlargest(1, stocks, key=lambda stocks: stocks["price"]))
Example #50
0
def equalWeightsSP500Select(features):
    """
    1. Get the number of stocks in each sector
    2. Allocate capital according to the proportion of stocks in that sector
    3. Randomly pick different stocks in the sector
    4. Rebalance every time when there is changes in S&P constituents and pick different stocks to reduce variance
    """

    # Get the inclusion matrix
    inclusionMatrix = getTickersSP500(ticker=features.tickers,
                                      startDate=features.startDate,
                                      endDate=features.endDate,
                                      asMatrix=True)

    # Create dataframe with tickers as columns
    px = features.subset(fields='bb_live', asDataFeatures=True)
    selection = inclusionMatrix.copy()
    selection.iloc[:, :] = 0.0  #initialise
    rebalDates = inclusionMatrix.index[~(
        inclusionMatrix == inclusionMatrix.shift(1)).all(axis=1)].tolist()
    selection = selection.reindex(index=rebalDates)
    save = selection.iloc[0, :]

    # Loop through the rebalance dates
    for date in rebalDates:

        # If stock in portfolio no longer in S&P
        if (save * inclusionMatrix.loc[date, :]).sum() != 1:

            gics = getGICSDescription()
            sectors = gics.sector.unique()
            numofstock = []

            # Find out the number of stocks required for each sector
            for i in sectors:
                fins = gics[gics.sector == i].industry.unique().tolist()
                # For robustness
                tkr = getTickersSP500(ticker=features.tickers,
                                      industry=fins,
                                      zoom=str(date)).ticker.unique().tolist()
                numofstock.append(len(tkr))

            newnumofstock = [int(i / sum(numofstock) * 50) for i in numofstock]

            if sum(newnumofstock) != 50:
                for i in heapq.nlargest(50 - sum(newnumofstock),
                                        range(len(newnumofstock)),
                                        newnumofstock.__getitem__):
                    newnumofstock[i] += 1

            includedAtDate = (inclusionMatrix.loc[date, :] > 0.0).values

            # Add the stock into the portfolio if chosen
            for i, d in enumerate(sectors):
                fins = gics[gics.sector == d].industry.unique().tolist()
                tkr = getTickersSP500(ticker=features.tickers,
                                      industry=fins,
                                      zoom=str(date)).ticker.unique().tolist()
                tkr = [tk for tk, incl in zip(tkr, includedAtDate) if incl]
                random.seed(10)
                random.shuffle(tkr)
                selected = tkr[:newnumofstock[i]]
                selection.loc[date, selected] = 0.02

            save = selection.loc[date, selected]

    selection = selection[selection.sum(axis=1) == 1]

    return selection
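The heapq.nlargest call over range(len(newnumofstock)) with newnumofstock.__getitem__ as the key is a largest-remainder style top-up: the slots lost to integer rounding are handed back to the sectors that already hold the most stocks, so the allocation sums to 50 again. A minimal standalone sketch of that pattern (allocate_slots is a hypothetical helper, not part of the strategy above):

import heapq


def allocate_slots(counts, total_slots=50):
    # Proportional allocation with floor rounding; assumes sum(counts) > 0.
    slots = [int(c / sum(counts) * total_slots) for c in counts]
    # Hand each leftover slot to one of the largest buckets, mirroring the
    # nlargest-over-indices trick used above.
    for i in heapq.nlargest(total_slots - sum(slots), range(len(slots)),
                            key=slots.__getitem__):
        slots[i] += 1
    return slots


print(allocate_slots([120, 60, 30, 15]))  # -> [27, 14, 6, 3], which sums to 50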
Example #51
0
 def most_common(self, n=None):
     if n is None:
         return sorted(self.items(), key=_itemgetter(1), reverse=True)
     return _heapq.nlargest(n, self.items(), key=_itemgetter(1))
Example #52
0
    def simStep(self, reward, state):
        numActive = int(self._activeRatio * self._numHidden)

        self._stimuli = np.dot(self._weightsFF, state)
        activations = self._stimuli - self._biases

        # Generate tuples for sorting
        heap = [(activations.item(0), 0)]

        for i in range(1, self._numHidden):
            heapq.heappush(heap, (activations.item(i), i))

        # Use sorted information for inhibition
        hiddenStatesPrev = copy(self._hiddenStates)

        self._sparsities = np.zeros((self._numHidden, 1))

        nLargest = heapq.nlargest(numActive, heap, key=itemgetter(0))

        # Inhibition
        for i in range(0, numActive):
            self._sparsities[nLargest[i][1]] = 1.0

        self._hiddenStates = np.multiply(self._sparsities, activations)

        # Q
        q = np.dot(self._weightsQ, self._hiddenStates).item(0)

        # Action
        action = np.tanh(np.dot(self._weightsAction, self._hiddenStates))

        actionExp = copy(action)

        for i in range(0, self._numAction):
            if np.random.rand() < self._noise:
                actionExp[i] = np.random.rand() * 2.0 - 1.0

        #actionExp = np.minimum(1.0, np.maximum(-1.0, action + np.random.randn(self._numAction, 1) * self._noise))

        # Reconstruction
        recon = np.dot(self._weightsFF.T, self._hiddenStates)

        delta = state - recon

        # Update
        self._weightsFF += self._alphaFF * np.dot(self._hiddenStates, delta.T)

        tdError = reward + self._gamma * q - self._prevV

        self._tracesQ = np.maximum(self._tracesQ * self._lambda,
                                   hiddenStatesPrev.T)

        self._weightsQ += self._alphaQ * tdError * self._tracesQ

        if tdError > 0.0:
            self._weightsAction += self._alphaAction * np.dot(
                self._actionDelta, hiddenStatesPrev.T)

        self._biases += self._alphaBias * (self._stimuli - self._biases)

        self._prevV = q

        self._actionDelta = actionExp - action

        return actionExp
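The heap push followed by heapq.nlargest above is a k-winners-take-all inhibition step: only the numActive most stimulated hidden units stay active. A NumPy-only sketch of the same idea (k_winners_take_all is a hypothetical name, assuming activations is a (numHidden, 1) column vector and 0 < num_active <= numHidden):

import numpy as np


def k_winners_take_all(activations, num_active):
    flat = activations.ravel()
    # Indices of the num_active largest activations (their relative order is unspecified).
    winners = np.argpartition(flat, -num_active)[-num_active:]
    sparsities = np.zeros_like(flat)
    sparsities[winners] = 1.0
    # Zero every unit except the winners, matching the inhibition loop above.
    return (sparsities * flat).reshape(activations.shape)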
Example #53
0
 def nlargest(self, n, *args, **kwargs):
     return heapq.nlargest(n, self, *args, **kwargs)
Example #54
0
                WORD2COUNT[word] += 1
    # Normalise each count by the maximum word count
    max_count = max(WORD2COUNT.values())
    for key in WORD2COUNT:
        WORD2COUNT[key] = WORD2COUNT[key] / max_count
    return WORD2COUNT


SENT2SCORE = {}
WORD2COUNT = word2count_score(TEXT)


def sent2count_score():
    '''Creating Dictionary for each sentence score using their word count score'''
    for sentence in SENTENCES:
        for word in nltk.word_tokenize(sentence.lower()):
            if word in WORD2COUNT:
                #            if len(sentence.split(' ')) < 25:
                if sentence not in SENT2SCORE:
                    SENT2SCORE[sentence] = WORD2COUNT[word]
                else:
                    SENT2SCORE[sentence] += WORD2COUNT[word]
    return SENT2SCORE


SENT2SCORE = sent2count_score()
# Selecting the top scored sentences for our extractive summary
BEST_SENTENCES = nlargest(120, SENT2SCORE, key=SENT2SCORE.get)

print('Following are the best rated sentences -->>')
for best_sentence in BEST_SENTENCES:
    print(best_sentence)
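Note that nlargest(120, SENT2SCORE, key=SENT2SCORE.get) returns the selected sentences ordered by score rather than by where they occur in the source text. If the summary should read in document order, a small wrapper can restore it (build_summary is a hypothetical helper, assuming SENTENCES keeps the original sentence order):

from heapq import nlargest


def build_summary(sent_scores, sentences, n=120):
    # Pick the n best-scoring sentences, then emit them in document order.
    best = set(nlargest(n, sent_scores, key=sent_scores.get))
    return ' '.join(s for s in sentences if s in best)


print(build_summary(SENT2SCORE, SENTENCES, n=120))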
Example #55
0
def getDataForUrl(url):

    data = {}
    person_details = []
    personFound = False
    urllib.request.urlretrieve(url, os.getcwd() + '/' + str(1) + ".jpg")
    imgg2 = cv2.imread(os.getcwd() + '/' + str(1) + ".jpg")
    imgg = cv2.cvtColor(imgg2, cv2.COLOR_BGR2RGB)
    boxes = face_recognition.face_locations(imgg, model="hog")
    encodings = face_recognition.face_encodings(imgg, boxes)

    for encoding in encodings:
        matches = face_recognition.compare_faces(X, encoding)
        matchCount = {}
        for inx, m in enumerate(matches):
            if (m):
                if (y[inx] not in matchCount):
                    matchCount[y[inx]] = 1
                else:
                    matchCount[y[inx]] += 1
        print(matchCount)
        if (not bool(matchCount)):
            continue
        predictedName = max(matchCount.items(), key=operator.itemgetter(1))[0]
        print('Person identified: ' + predictedName)

        if (matchCount[predictedName] < 4):
            continue

        personFound = True
        for index, row in df.iterrows():
            if (row['Name'] == predictedName.rsplit(' ', 1)[0]):
                person_details.append(row.to_dict())

    data['Person details'] = person_details

    img = Image.open(os.getcwd() + '/' + str(1) + ".jpg")
    x = img.resize((224, 224))
    arr = np.array(x)
    arr2 = np.reshape(arr, (1, 224, 224, 3))
    arr2 = arr2 / 255

    arr3 = pre_model.predict(arr2)
    predd = model.predict(arr3)
    pred = predd[0].tolist()

    sortedd = heapq.nlargest(3, pred)
    g1 = pred.index(sortedd[0])
    g2 = pred.index(sortedd[1])
    g3 = pred.index(sortedd[2])
    data['Sports Guess 1'] = key[g1]
    data['Sports Guess 2'] = key[g2]
    data['Sports Guess 3'] = key[g3]

    standardResponse = {}
    successObject = {}
    successObject['code'] = 200
    if (personFound):
        successObject['message'] = 'Person found'
    else:
        successObject['message'] = 'Person not found'
    successObject['data'] = data
    standardResponse['success'] = successObject

    return standardResponse
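The vote over matched encodings (matchCount plus max with operator.itemgetter) can also be expressed with collections.Counter, whose most_common is itself backed by heapq.nlargest, as the Counter.most_common snippet earlier shows. A minimal sketch under the same assumptions about matches and y (predict_name is a hypothetical helper, not part of the service above):

from collections import Counter


def predict_name(matches, labels, min_votes=4):
    # Count how many reference encodings voted for each label.
    votes = Counter(label for match, label in zip(matches, labels) if match)
    if not votes:
        return None
    name, count = votes.most_common(1)[0]
    # Mirror the original threshold: discard weakly supported identifications.
    return name if count >= min_votes else None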
Example #56
0
def get_average_points_per_position(position,
                                    players,
                                    topn,
                                    plot_output=False,
                                    print_output=False):
    """
    See if the AVT theory holds true - calculates means/std dev of the 1-topn best players at each position
    """
    # populate a list of values for the top N players at that position to perform statistics on later
    top_n_players = [[] for _ in range(topn)]
    for year in range(global_start_year, global_end_year):
        total_scores_for_year = [
            (players[player].get_total_score_for_year(year), player, year)
            for player in players
            if players[player].player_position == position
        ]
        top_n_pos = nlargest(topn, total_scores_for_year)
        for (ind, elem) in enumerate(top_n_pos):
            top_n_players[ind].append((elem[1], elem[2]))

    # calculate statistics
    x = []
    y = []
    e_season = []
    e_game = []
    for (ind, top_n_scores) in enumerate(top_n_players):
        # model = each season is a normally distributed variable, with each game as a sample
        # mean = get mean of all the total scores per player (storing player and year for top n scores)
        # std dev = (assumption is all seasons are independent) sum variances and square root sum
        # std dev per game = random sample, with mean = mean/16 and std dev = season std dev / sqrt(16)
        mean_season = round(
            np.mean([
                players[player].get_total_score_for_year(year)
                for (player, year) in top_n_scores
            ]), 3)
        mean_game = round(mean_season / 16.0, 3)
        stddev_season = round(
            np.sum([
                players[player].get_stddev_for_year(year)**2
                for (player, year) in top_n_scores
            ])**0.5, 3)
        stddev_game = round(stddev_season / 4.0,
                            3)  # divide by sqrt(n = 16 games) = 4
        if print_output:
            print(f"For {position}{ind+1}:")
            print(f"Mean: {mean_season} ({mean_game} points per game)")
            print(f"Std. Dev: {stddev_season} ({stddev_game} points per game)")
        x.append(ind + 1)
        y.append(mean_season)
        e_season.append(stddev_season)
        e_game.append(stddev_game)

    if plot_output:
        plt.errorbar(np.array(x),
                     np.array(y),
                     np.array(e_season),
                     linestyle='None',
                     marker='o',
                     capsize=2,
                     color="blue",
                     ecolor="red")
        plt.title(
            f"Average Points Scored by the Top {topn} {position}s Per Season ")
        plt.xlabel(f"{position} position at finish")
        xticks_arr = [str(num + 1) for num in range(topn)]
        xticks_arr.insert(0, "")
        plt.xticks(np.arange(topn + 1), xticks_arr)
        plt.ylabel("Points / Season")
        plt.show()

        plt.errorbar(np.array(x),
                     np.array(y) / 16,
                     np.array(e_game),
                     linestyle='None',
                     marker='o',
                     capsize=2,
                     color="blue",
                     ecolor="red")
        plt.title(
            f"Average Points Scored by the Top {topn} {position}s Per Game")
        plt.xlabel(f"{position} position at finish")
        xticks_arr = [str(num + 1) for num in range(topn)]
        xticks_arr.insert(0, "")
        plt.xticks(np.arange(topn + 1), xticks_arr)
        plt.ylabel("Points / Game")
        plt.show()

    return np.array(y), np.array(e_season)
Example #57
0
def getListMaxNumIndex(num_list, topk=5):
    max_num_index = list(map(num_list.index, heapq.nlargest(topk, num_list)))
    return max_num_index
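Because getListMaxNumIndex maps each of the topk values back through num_list.index, duplicated values all collapse onto the first matching position. Pairing values with their indices up front avoids that; get_topk_indices below is an assumed drop-in alternative, not part of the original snippet:

import heapq


def get_topk_indices(num_list, topk=5):
    # Keep each value paired with its position so ties map to distinct indices.
    top = heapq.nlargest(topk, enumerate(num_list), key=lambda pair: pair[1])
    return [index for index, _ in top]


print(get_topk_indices([3, 7, 7, 1], topk=2))  # -> [1, 2] rather than [1, 1]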
Example #58
0
 def _rank(self, ranking, n):
     """ return the first n sentences with highest ranking """
     return nlargest(n, ranking, key=ranking.get)
Example #59
0
 def findKthLargestHeap(self, nums: List[int], k: int) -> int:
     return heapq.nlargest(k, nums)[-1]
Example #60
0
def maximumproduct(nums):
    import heapq
    a, b = heapq.nlargest(3, nums), heapq.nsmallest(2, nums)
    return max(a[0] * a[1] * a[2], a[0] * b[0] * b[1])
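The two candidate products cover both sign patterns: either the three largest values, or the single largest value times the two most negative values. For example, with the function above:

print(maximumproduct([5, 4, 3, 2]))        # 60  = 5 * 4 * 3
print(maximumproduct([-10, -9, 1, 2, 3]))  # 270 = 3 * (-10) * (-9)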