def reducer_final(self): mk_term_largest = heapq.nlargest(10,self.mk_top10_termlist) mk_top10term = [(key,int(count)) for count,key in mk_term_largest] ks_term_largest = heapq.nlargest(10,self.ks_top10_termlist) ks_top10term = [(key,int(count)) for count,key in ks_term_largest] yield None,('mk',mk_top10term) yield None,('ks',ks_top10term)
def count_mon_max_data_avg(fileObj, year, cat_flag): count = 0 flag = 0 avg_data = [] year_data = [] fd = open(fileObj, 'r') for line in fd.readlines(): a = re.split(',|\n| ', line) # Notice the None value if ((int)(a[YEAR]) == year): if (flag == 0): temp = (int)(a[MON]) flag = 1 if (len(a[cat_flag]) != 0): year_data.append((float)(a[cat_flag])) count = count + 1 if(temp != (int)(a[MON])): length = (int)(count * RATE) if (length == 0): length = 1 value = heapq.nlargest(length, year_data) avg = mean(value) avg_data.append(avg) year_data = [] count = 0 temp = (int)(a[MON]) length = (int)(count * RATE) value = heapq.nlargest(length, year_data) avg = mean(value) avg_data.append(avg) year_data = [] fd.close() return avg_data
def mapper_final_term_gettop10(self): mk_term_largest = heapq.nlargest(10,self.mk_term) ks_term_largest = heapq.nlargest(10,self.ks_term) for count,key in mk_term_largest: yield ('mk_heap',(count,key)) for count,key in ks_term_largest: yield ('ks_heap',(count,key))
def timeit_plot3D(data, xlabel='xlabel', ylabel='ylabel', **kwargs): """3D plot of timeit data, one chart per function. """ dataT = {} figs = [] series = kwargs.get('series', (0,1)) cmap = kwargs.get('cmap', cm.coolwarm) for k, v in data.items(): dataT[k] = zip(*v) fig = plt.figure() ax = fig.gca(projection='3d') X, Y, Z = dataT[k][series[0]], dataT[k][series[1]], dataT[k][-1] wide, tall = (max(X)-min(X)+1), (max(Y)-min(Y)+1) intervalX = max(X) - min(heapq.nlargest(2,set(X))) intervalY = max(Y) - min(heapq.nlargest(2,set(Y))) wide, tall = 1+wide/intervalX, 1+tall/intervalY X = np.reshape(X, [wide, tall]) Y = np.reshape(Y, [wide, tall]) # TODO: BUG: fix so that Z transposes with x & y reversed Z = np.reshape(Z, [wide, tall]) surf = ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap=cmap, linewidth=0, antialiased=False) ax.zaxis.set_major_locator(LinearLocator(10)) ax.zaxis.set_major_formatter(FormatStrFormatter('%.02f')) ax.set_xlabel(xlabel) ax.set_ylabel(ylabel) ax.set_title(substitute_titles(k,series)) fig.colorbar(surf, shrink=0.5, aspect=5) figs.append(fig) return figs
def filter_incoming_data(data): hor = data[0:4] vert = data[4:] deltaf_threshold = 60 # threshold proximity if ((max(hor) < deltaf_threshold) or max(vert) < deltaf_threshold): print "LOW: ", data return (-1,-1) # find largest two values in array # store index associated with values too # format: [(index1, largestval), (index2, secondlargestval)] maxHor = heapq.nlargest(2, enumerate(hor), key=lambda x: x[1]) maxVert = heapq.nlargest(2, enumerate(vert), key=lambda x: x[1]) # check that they're neighbors if (abs(maxHor[1][0] - maxHor[0][0]) != 1): print "Horizontal messed up", maxHor return (-1,-1) if (abs(maxVert[1][0] - maxVert[0][0]) != 1): print "Vertical messed up", maxVert return (-1,-1) if (maxHor[0][1] < 1) or (maxHor[1][1] < 1) or (maxVert[0][1] < 1) or (maxVert[1][1] < 1): print "value is neg" return (-1,-1) return (maxHor, maxVert)
def sortcat(n, *args):
    # check that each argument after n is a string
    for arg in args:
        if not isinstance(arg, str):
            print("Usage: sortcat(int, string1, string2,...)")
            return 1
    from heapq import nlargest
    # n == -1 means "use all of the string arguments"
    count = len(args) if n == -1 else n
    longest = nlargest(count, args, key=len)  # list of the n longest string arguments
    print("".join(longest))                   # join them into a single string and print
    return 0
def addElement(self, num): if len(self.minHeap) == 0: heapq.heappush(self.minHeap, num) else: minHeapTop = heapq.nsmallest(1,self.minHeap)[0] if len(heapq.nlargest(1,self.maxHeap)) == 0: heapq.heappush(self.maxHeap, min(num,minHeapTop)) heapq.heappushpop(self.minHeap, max(num,minHeapTop)) else: maxHeapTop = heapq.nlargest(1,self.maxHeap)[0] if num > minHeapTop: tmp = minHeapTop minHeapTop = num num = tmp elif num < maxHeapTop: tmp = maxHeapTop maxHeapTop = num num = tmp heapq.heappushpop(self.minHeap, minHeapTop) heapq._heappushpop_max(self.maxHeap, maxHeapTop) if len(self.minHeap) - len(self.maxHeap) > 0: heapq.heappush(self.maxHeap,num) else: heapq.heappush(self.minHeap,num)
def k_NN(): similar = {} for i, dict in enumerate(training_data): similar[i] = len(set(dict['feature_vector']).intersection(set(test_features))) knn = nlargest(k_neighbors, similar, key=similar.get) # print knn category = {'Tech':0, 'Non-Tech':0} for neighbor in knn: if training_data[neighbor]['label'] == 'Tech': category['Tech'] += 1 else: category['Non-Tech'] += 1 # print category label_knn = nlargest(1, category, key=category.get)[0] print "k-NN thinks it is a " + label_knn + " article." if label_knn=="Tech": return 1 else: return 0
def main(): logging.basicConfig(level=logging.INFO, format='%(message)s') parser = argparse.ArgumentParser(description='Print LDA model') parser.add_argument('model', help='trained model') args = parser.parse_args() with open(args.model) as m: model = cPickle.load(m) pm = model.pattern_model def dec(p): pattern = model.pattern_vocabulary[p] return '+'.join(model.morpheme_vocabulary[m] for m in pattern) patt_prob = ((pm.prob(p), p) for p in xrange(len(model.pattern_vocabulary))) for prob, p in heapq.nlargest(100, patt_prob): print(u'{0} {1}'.format(dec(p), prob).encode('utf8')) print('---------') for i, topic in enumerate(model.topic_word): print('Topic {0}'.format(i)) stem_topic = topic.base.stem_model word_prob = ((stem_topic.prob(w), w) for w in xrange(len(model.stem_vocabulary))) for prob, w in heapq.nlargest(10, word_prob): print(u'{0} {1}'.format(model.stem_vocabulary[w], prob).encode('utf8')) print('---------')
def _locate_pinch_points(self, pinch_count: int) -> None: """Locate the pinch points. Args: pinch_count: Number of pinch points. """ spendq = [] svgq = [] self.cashflow = collections.OrderedDict( sorted(self.cashflow.items(), key=lambda t: t[0])) last_k = None for k in self.cashflow.keys(): spend, svg, days = self.cashflow[k] if last_k: # Calculate running totals spend += self.cashflow[last_k][0] svg += self.cashflow[last_k][1] try: # Calculate and save spending and saving deltas spend_d = spend / days svg_d = (spend + svg) / days except (decimal.InvalidOperation, decimal.DivisionByZero): spend_d = svg_d = 0 self.cashflow[k] = [spend, svg, spend_d, svg_d] heapq.heappush(spendq, (spend_d, k)) heapq.heappush(svgq, (svg_d, k)) last_k = k self.pinch_points_spend = heapq.nlargest(pinch_count, spendq) self.pinch_points_spend = [item[1] for item in self.pinch_points_spend] self.pinch_points_svg = heapq.nlargest(pinch_count, svgq) self.pinch_points_svg = [item[1] for item in self.pinch_points_svg]
def getSkyline(self, buildings): """ :type buildings: List[List[int]] :rtype: List[List[int]] """ n = len(buildings) start = map(lambda x:x[0],buildings) end = map(lambda x:x[1], buildings) height = map(lambda x: x[2], buildings) s_t = zip(start, height, [0] * n) e_t = zip(end, height, [1] * n) total_t = s_t + e_t total_t = sorted(total_t, key=lambda x:(x[0],x[2])) h = [0] res = [] heapq.heapify(h) for t in total_t: if t[2] == 0: prev = heapq.nlargest(1,h)[0] if h else 0 heapq.heappush(h, t[1]) if prev < t[1]: res.append(t[:2]) elif t[2] == 1: h.remove(t[1]) heapq.heapify(h) top = heapq.nlargest(1,h) if heapq.nlargest(1,h) else [0] if top[0] < t[1]: res.append([t[0]] + top) return res
def run(self): logging.debug("====================\nInitializing Food Processor\n\n") start = datetime.now() try: logging.debug("====================\n Startiing mapping\n\n") MapperTask().run() logging.debug("====================\n Startiing reducing\n\n") ReducerTask().run() with open(os.path.join(settings.FILE_PATH, "category_reduce.txt"), "r") as reduce_file: content = simplejson.loads(reduce_file.read()) #heap resolves in o(n lg n) top_categories = heapq.nlargest(5, content, key=lambda k: content[k]) with open(os.path.join(settings.FILE_PATH, "food_reduce.txt"), "r") as reduce_file: content = simplejson.loads(reduce_file.read()) top_foods = heapq.nlargest(100, content, key=lambda k: content[k]) logging.info("\tTop Foods: {}".format(",".join(top_foods))) logging.info("\tTop categories: {}".format(",".join(top_categories))) except Exception, e: logging.exception("====================\nThere were some problemas while processing food information\n")
def addNum(self, num):
    """
    Adds a num into the data structure.
    :type num: int
    :rtype: void
    """
    # smallest element currently on the large (upper-half) heap; -inf when it is empty,
    # so the very first number is routed to the large heap
    largestValue = heapq.nsmallest(1, self.large)[0] if self.large else float('-inf')
    if num >= largestValue:
        heapq.heappush(self.large, num)
        self.mark1 += 1
    else:
        heapq.heappush(self.small, num)
        self.mark2 += 1
    if self.mark1 == 2:
        temp = heapq.heappop(self.large)
        heapq.heappush(self.small, temp)
        self.mark1 = 0
    if self.mark2 == 2:
        temp = heapq.nlargest(1, self.small)[0]
        self.small.remove(temp)
        heapq.heapify(self.small)
        heapq.heappush(self.large, temp)
        self.mark2 = 0
def proc_unigram_feats(): mat,key,regy,_ = rs.extract_feats([rs.unigram_feats]) inv_key = {v:k for k,v in key.items()} num_movies,num_words = mat.get_shape() movies = [(regy[i],i) for i in range(num_movies)] min_movies = heap.nsmallest(MOVIE_TARGET,movies) max_movies = heap.nlargest(MOVIE_TARGET,movies) tot_min = 0. tot_max = 0. for mv in min_movies: tot_min += mat[mv[1]].sum() for mv in max_movies: tot_max += mat[mv[1]].sum() fix = tot_max/tot_min diffs = np.zeros((num_words)) for mv in min_movies: diffs += -1.*fix*mat[mv[1]] for mv in max_movies: diffs += mat[mv[1]] with open("english.stop") as f: stop_words = set([line.strip() for line in f.readlines()]) words = [(diffs[0,i],inv_key[i]) for i in range(num_words) if inv_key[i] not in stop_words] worst_words = heap.nsmallest(WORD_TARGET, words) worst_words.sort() best_words = heap.nlargest(WORD_TARGET, words) best_words.sort() for wd in worst_words: print wd[1] + '\t' + str(wd[0]) print '---------------------------------' for wd in best_words: print wd[1] + '\t' + str(wd[0])
def get_median(self, n): if len(self.hmin) == len(self.hmax): if n <= self.median: heapq.heappush(self.hmin, n) self.median = heapq.nlargest(1, self.hmin)[0] else: assert (n > self.median) heapq.heappush(self.hmax, n) self.median = heapq.nsmallest(1, self.hmax)[0] elif len(self.hmin) < len(self.hmax): if n <= self.median: heapq.heappush(self.hmin, n) self.median = heapq.nlargest(1, self.hmin)[0] else: assert (n > self.median) item = heapq.heappop(self.hmax) heapq.heappush(self.hmin, item) heapq.heappush(self.hmax, n) self.median = heapq.nlargest(1, self.hmin)[0] else: assert (len(self.hmin) > len(self.hmax)) if n <= self.median: item = self._heappop_max(self.hmin) heapq.heappush(self.hmax, item) heapq.heappush(self.hmin, n) self.median = heapq.nlargest(1, self.hmin)[0] else: assert (n > self.median) heapq.heappush(self.hmax, n) self.median = heapq.nlargest(1, self.hmin)[0] return self.median
def SeqLenDis(indexes): lengths = [] for index in indexes: lengths.append(int(index[2]) - int(index[1])) print heapq.nlargest(30, lengths) plt.hist(lengths) plt.show()
def accuracy(self, data, convert=False, items=1):
    """Return the number of inputs in ``data`` for which the neural
    network outputs the correct result. The neural network's output is
    assumed to be the index of whichever neuron in the final layer has
    the highest activation.

    The flag ``convert`` should be set to False if the data set is
    validation or test data (the usual case), and to True if the data
    set is the training data. The need for this flag arises due to
    differences in the way the results ``y`` are represented in the
    different data sets. In particular, it flags whether we need to
    convert between the different representations.

    It may seem strange to use different representations for the
    different data sets. Why not use the same representation for all
    three data sets? It's done for efficiency reasons -- the program
    usually evaluates the cost on the training data and the accuracy on
    other data sets. These are different types of computations, and
    using different representations speeds things up. More details on
    the representations can be found in mnist_loader.load_data_wrapper.
    """
    if convert:
        results = [(map(self.feedforward(x).tolist().index,
                        heapq.nlargest(items, self.feedforward(x).tolist())),
                    np.argmax(y))
                   for (x, y) in data]
    else:
        results = [(map(self.feedforward(x).tolist().index,
                        heapq.nlargest(items, self.feedforward(x).tolist())),
                    y)
                   for (x, y) in data]
    counter = 0
    for (x, y) in results:
        # x is the list of the ``items`` highest-activation indices, so test membership
        # rather than equality (an int never compares equal to a list)
        if y in x:
            counter = counter + 1
    return counter
def print_top(fn):
    # parse words from the file into a dict of word -> count
    word_counts = utils.parse_words_from_file(fn)
    # extract the 20 most common words from the dict and print them
    print heapq.nlargest(20, word_counts, key=lambda w: word_counts[w])
def test_nlargest(self): data = [(random.randrange(2000), i) for i in range(1000)] for f in (None, lambda x: x[0] * 547 % 2000): for n in (0, 1, 2, 10, 100, 400, 999, 1000, 1100): self.assertEqual(nlargest(n, data), sorted(data, reverse=True)[:n]) self.assertEqual(nlargest(n, data, key=f), sorted(data, key=f, reverse=True)[:n])
def heap_median_maintenance(read_in): starting_list = [] median = [] for i in read_in: starting_list.append(i) #If it's the first element being read in, that is the median if len(starting_list) == 1: low_heap = heapq.nsmallest(len(starting_list), starting_list) high_heap = heapq.nlargest(len(starting_list)-1, starting_list) #if even then split half way elif len(starting_list)%2 ==0: low_heap = heapq.nsmallest(len(starting_list)/2, starting_list) high_heap = heapq.nlargest(len(starting_list)/2, starting_list) #if odd give the larger portion to low heap else: low_list_amount = int(math.ceil(float(len(starting_list))/2)) high_list_amount = int(len(starting_list) - math.ceil(float(len(starting_list))/2)) low_heap = heapq.nsmallest(low_list_amount, starting_list) high_heap = heapq.nlargest(high_list_amount, starting_list) #print("Low heap has {} and high heap has {}".format(len(low_heap), len(high_heap))) #print("Low heap {}".format(low_heap)) #print("high heap {}".format(high_heap)) #print("Median is {}".format(heapq.nlargest(1, low_heap)[0])) #append median from the largest element of the low_heap median.append(heapq.nlargest(1, low_heap)[0]) return median
def mostcommon(self, n, which_kind): num = n print "--------------------------------------------------------------------" if which_kind == "continuous": if self.continuous == 0: print "No continuous connectives found!" else: print "Printing top " + str(n) + " continuous connectives:" #, which is " + str(num) + " items:" for key, freq in nlargest(num, self.continuous_dict.iteritems(), key=itemgetter(1)): print "\t", key.part_one[0], "occurs ", self.continuous_dict[key], " times, which is ", float(self.continuous_dict[key])/float(self.continuous)*100, " percent." elif which_kind == "discontinuous": if self.discontinuous == 0: print "No discontinuous connectives found!" else: print "Printing top " + str(n) + " discontinuous connectives:" #, which is " + str(num) + " items:" for key, freq in nlargest(num, self.discontinuous_dict.iteritems(), key=itemgetter(1)): print "\t", key.part_one[0], " ... ", key.part_two[0], "occurs ", self.discontinuous_dict[key], " times, which is ", float(self.discontinuous_dict[key])/float(self.discontinuous)*100, " percent." elif which_kind == "ambiguous": if self.ambiguous() == 0: print "No ambiguous connectives found!" else: print "Printing top " + str(n) + " ambiguous connectives:" #, which is " + str(num) + " items:" for key, freq in nlargest(num, self.ambiguous_dict.iteritems(), key=itemgetter(1)): print "\t", key.part_one[0], if key.sep == "continuous": print " ... ", key.part_two[0], print "occurs ", self.ambiguous_dict[key], " times, which is ", float(self.ambiguous_dict[key])/float(self.ambiguous())*100, " percent." return
def show_top(model): top_prefixes = heapq.nlargest(10, izip(model.base.theta_p.counts, model.prefix_vocabulary)) n_prefixes = sum(1 for c in model.base.theta_p.counts if c > 0) logging.info('Top prefixes (10/%d): %s', n_prefixes, ' '.join(prefix+':'+str(c) for c, prefix in top_prefixes)) top_suffixes = heapq.nlargest(10, izip(model.base.theta_s.counts, model.suffix_vocabulary)) n_suffixes = sum(1 for c in model.base.theta_s.counts if c > 0) logging.info('Top suffixes (10/%d): %s', n_suffixes, ' '.join(suffix+':'+str(c) for c, suffix in top_suffixes))
def DumpAudioDiagnostics(self, dir_name="./data/", top_k=10, bot_k=10): # utterance level diag import heapq utt_largest = heapq.nlargest(top_k, self.utt_feature, key=self.utt_feature.get) i = 0 for utt in utt_largest: utt_id = string.join(utt.split("_")[0:-2], "_") t_beg = float(utt.split("_")[-2]) / self.samp_period t_end = float(utt.split("_")[-1]) / self.samp_period file_id = self.list_files[self.map_utt_idx[utt_id]] out_file = "./data/" + repr(i) + "large_srate_" + os.path.basename(file_id).split(".")[0] + ".wav" util.cmdconvert(file_id, out_file, t_beg, t_end) i += 1 utt_smallest = heapq.nsmallest(bot_k, self.utt_feature, key=self.utt_feature.get) i = 0 for utt in utt_smallest: utt_id = string.join(utt.split("_")[0:-2], "_") t_beg = float(utt.split("_")[-2]) / self.samp_period t_end = float(utt.split("_")[-1]) / self.samp_period file_id = self.list_files[self.map_utt_idx[utt_id]] out_file = "./data/" + repr(i) + "small_srate_" + os.path.basename(file_id).split(".")[0] + ".wav" util.cmdconvert(file_id, out_file, t_beg, t_end) i += 1 # glob level diag glob_largest = heapq.nlargest(top_k, self.glob_feature, key=self.glob_feature.get) for utt_id in glob_largest: file_id = self.list_files[self.map_utt_idx[utt_id]] out_file = "./data/glob_large_srate_" + os.path.basename(file_id).split(".")[0] + ".wav" util.cmdconvert(file_id, out_file) glob_smallest = heapq.nsmallest(top_k, self.glob_feature, key=self.glob_feature.get) for utt_id in glob_smallest: file_id = self.list_files[self.map_utt_idx[utt_id]] out_file = "./data/glob_small_srate_" + os.path.basename(file_id).split(".")[0] + ".wav" util.cmdconvert(file_id, out_file)
def get_top_k_users(user, all_users, k, similarity_metric, is_user_generated=False): similarity_funct = user.get_similarity_funct(similarity_metric) # Discount the first user since it is the same as the one being # compared against if not is_user_generated: return heapq.nlargest(k+1, all_users, similarity_funct)[1:] else: return heapq.nlargest(k, all_users, similarity_funct)
def getmedian(l, r):
    # l holds the lower half of the values, r the upper half
    if len(l) > len(r):
        return heapq.nlargest(1, l)[0]
    elif len(l) == len(r):
        return (heapq.nlargest(1, l)[0] + heapq.nsmallest(1, r)[0]) / 2
    else:
        return heapq.nsmallest(1, r)[0]
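# A minimal usage sketch for getmedian (added here; it assumes the caller keeps `l` as the
# lower half and `r` as the upper half -- nlargest/nsmallest work on plain lists, so strict
# heap order is not required):
import heapq

l, r = [], []
for n in [5, 2, 8, 1, 9]:
    # route the number to the half it belongs to, then rebalance so sizes differ by at most one
    if not l or n <= max(l):
        heapq.heappush(l, n)
    else:
        heapq.heappush(r, n)
    if len(l) > len(r) + 1:
        biggest = heapq.nlargest(1, l)[0]
        l.remove(biggest)
        heapq.heapify(l)
        heapq.heappush(r, biggest)
    elif len(r) > len(l) + 1:
        heapq.heappush(l, heapq.heappop(r))
    print(getmedian(l, r))   # running medians: 5, 3.5, 5, 3.5, 5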
def test(): list_test=list('abcde') heapq.heapify(list_test) logger.error(heapq.nlargest(3, list_test)) list_b = list('bcdef') heapq.heapify(list_b) list_res = [i for i in heapq.merge(list_test, list_b)] logger.error(heapq.nlargest(3, list_res))
def getClosestCases(list_of_label_files, list_of_similarity_xml_files, \ similarity, num_closest_cases): """Get the closest cases to the test case using some similarity metric and given a list of files that contain the similarity value. Parameters ---------- list_of_similarity_xml_files : list of file names (string) Each file name points to an xml file that contains the similarity value list_of_label_files : list of file names (string) Each file name points to a file that has a labeled image similarity : string Type of similarity used num_closest_cases : int The number of closest cases to return ... Returns ------- closest_cases : string 2D list with shape (2, num_cases) Contains the num_cases cases with highest similarity to the case being tested, and the associated similarity value ... """ #TODO: Account for different similarities num_training_cases = len(list_of_label_files) #Read the similarity values similarity_values = getMISimilarityVec(list_of_similarity_xml_files) print(similarity_values) #find highest matches indexes=[] for i in range(num_training_cases): indexes.append(i) if (similarity == "NCC"): nlargestvalues = heapq.nlargest(num_closest_cases, indexes, key=lambda \ i: abs(float(similarity_values[i]))) #take (from 0 to 10, assuming testint is not in trainng) else: nlargestvalues = heapq.nlargest(num_closest_cases, indexes, key=lambda \ i: (similarity_values[i])) #take (from 0 to 10, assuming testint is not in trainng) print(nlargestvalues) patient_atlas_labelmaps = [""]*num_closest_cases patient_atlas_similarity = [1.0]*num_closest_cases #Now store the num_cases in a 2D list for i in range(num_closest_cases): patient_atlas_labelmaps[i] = list_of_label_files[nlargestvalues[i]] print(patient_atlas_labelmaps[i]) patient_atlas_similarity[i] = similarity_values[nlargestvalues[i]] closest_cases = np.vstack((patient_atlas_labelmaps, patient_atlas_similarity)) return closest_cases
def _map(self, key, item): statistics = item[self.dname] total = item[self.tname] if total == 0.0: return None days = 0 weeks = 0 months = 0 data_day = {} data_week = {} data_month = {} sum_week = 0.0 sum_month = 0.0 for i in xrange(len(statistics)): stat = statistics[i] days = i sum_week += stat sum_month += stat data_day[days] = stat if days % 7 == 0: data_week[weeks] = sum_week weeks += 1 sum_week = 0.0 if days % 30 == 0: data_month[months] = sum_month months += 1 sum_month = 0.0 daily_top = nlargest(3, data_day.values()) weekly_top = nlargest(3, data_week.values()) monthly_top = nlargest(3, data_month.values()) daily_frac = None if len(daily_top) >= 3: daily_frac = (daily_top[0] / total, daily_top[1] / total, daily_top[2] / total) weekly_frac = None if len(weekly_top) >= 3: weekly_frac = (weekly_top[0] / total, weekly_top[1] / total, weekly_top[2] / total) monthly_frac = None if len(monthly_top) >= 3: monthly_frac = (monthly_top[0] / total, monthly_top[1] / total, monthly_top[2] / total) return (daily_frac, weekly_frac, monthly_frac)
def clientLatestSnapshot(self, onlyTs):
    clientTsList = self.subvolSplitTsList(self.clientSubvolList(withUUID=False))
    latestTs = heapq.nlargest(1, clientTsList)[0]
    if onlyTs is True:
        return latestTs
    else:
        for subvol in self.clientSubvolList(withUUID=False):
            if latestTs in subvol:
                logInfo("clientLatestSnapshot(): Newest client subvolume is %s" % subvol)
                return subvol
def returnNLargest(self, n): # return the N largest values in the data stored in the heap. # if n is less than k, return n largest data # if not, return the data of size k if(n <= self.__k): return heapq.nlargest(n, self.__data) else: print "n is greater than the size of data" return heapq.nlargest(self.__k, self.__data)
def choose_playlist(self): if not self.candidate_playlists: return None print('genre_weights') # Note that, at this point, we only have genre information of five tracks per playlist (this # is the information that SoundCloud usually returns for playlist requests). tracks_genre_distr = list(self.get_free_tracks_genre_distr().items()) tracks_genre_distr.sort(key=lambda item: item[1]) self.genre_weights = {genre: 0.85**(rank+1) if genre not in IGNORE_GENRES and \ genre != 'others' and \ genre != 'unknown' else 0.0 for rank, (genre, _) in enumerate(tracks_genre_distr)} print('candidates') # The below scoring code is pretty slow, and the number of candidate playlists grows over # time. We can try to speed it up by sampling a random subset of candidates in each step. candidates = random.sample(list(self.candidate_playlists.items()), k=20000) print('scores') weights = [] for _, candidate in candidates: track_values = [] new_tracks = 0 for track_info in candidate['tracks']: if track_info['id'] not in self.tracks: new_tracks += 1 if self.is_complete_track_info(track_info): mapped_genre = map_genre(track_info['genre']) if mapped_genre == 'ignore': track_value = 0.0 else: track_value = 1.0 track_value *= 1.0 if self.is_free(track_info['license']) else 0.00005 track_value *= 1.0 if self.is_track_okay(track_info) else 0.01 track_value *= 1.0 if track_info['id'] not in self.tracks else 0.01 track_value *= self.genre_weights.get(mapped_genre, 0.0) track_values.append(track_value) if len(track_values) == 0: weights.append(0.0) else: size_mult = 2/(1+math.exp(-new_tracks/20))-1 new_ratio = new_tracks / len(candidate['tracks']) mean_score = new_ratio * np.mean(track_values) score = size_mult * mean_score weights.append(math.exp(10000.0 * score)) print('topk') candidates_weights = zip(candidates, weights) candidates_weights = heapq.nlargest(50, candidates_weights, key=lambda pair: pair[1]) weights = [pair[1] for pair in candidates_weights] #for item, weight in candidates_weights[-50:]: # print(f' {weight}\t' # f'{item[1]["genre_distr"]}\t' # f'{item[1]["freeness"]}\t' # f'{self.calc_genre_novelty(item[1]["genre_distr"])}') print('sample') choice_item, choice_weight = random.choices(candidates_weights, weights=weights)[0] print(f' playlist_id: {choice_item[0]}, ' f'weight: {choice_weight}') return choice_item[0]
# Get the N largest and smallest elements
import heapq

nums = [1, 8, 2, 23, 7, -4, 18, 23, 42, 37, 2]
print(heapq.nlargest(3, nums))   # [42, 37, 23] -- the three largest numbers
print(heapq.nsmallest(1, nums))  # [-4] -- the smallest number
#!coding=utf-8
"""
Find the N largest or smallest elements
"""
# Get a list of the N largest or smallest elements from a collection
import heapq
nums = [1, 8, 2, 23, 7, -4, 18, 23, 42, 37, 2]
print heapq.nlargest(3, nums)   # maintains a min-heap internally
print heapq.nsmallest(3, nums)  # maintains a max-heap internally
portfolio = [
    {'name': 'IBM', 'shares': 100, 'price': 91.1},
    {'name': 'AAPL', 'shares': 50, 'price': 543.22},
    {'name': 'FB', 'shares': 200, 'price': 21.09},
    {'name': 'HPQ', 'shares': 35, 'price': 31.75},
    {'name': 'YHOO', 'shares': 45, 'price': 16.35},
    {'name': 'ACME', 'shares': 75, 'price': 115.65}
]
cheap = heapq.nsmallest(3, portfolio, key=lambda s: s['price'])
expensive = heapq.nlargest(3, portfolio, key=lambda s: s['price'])
print cheap
print expensive
# When you want the N largest or smallest elements of a collection and N is small
# relative to the number of elements, heapq offers very good performance: the
# underlying structure is a heap, so each extraction costs O(log n).
heap = list(nums)
heapq.heapify(heap)  # convert the list into a heap in place (heapq builds a min-heap)
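# A short continuation sketch (added here): once `heap` has been heapified, repeated
# heappop calls yield its elements in ascending order.
print [heapq.heappop(heap) for _ in range(3)]   # [-4, 1, 2] -- the three smallest, in order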
print('Making Dev preds') json_preds = {} json_preds['questions'] = [] num_docs = 0 for i in range(len(data['queries'])): num_docs += 1 dy.renew_cg() qtext = data['queries'][i]['query_text'] qwds, qvecs, qconv = model.MakeInputs(qtext) rel_scores = {} for j in range(len(data['queries'][i]['retrieved_documents'])): doc_id = data['queries'][i]['retrieved_documents'][j]['doc_id'] dtext = (docs[doc_id]['title'] + ' <title> ' + docs[doc_id]['abstractText']) dwds, dvecs, dconv = model.MakeInputs(dtext) bm25 = data['queries'][i]['retrieved_documents'][j][ 'norm_bm25_score'] efeats = model.GetExtraFeatures(qtext, dtext, bm25) efeats_vec = dy.inputVector(efeats) score = model.GetQDScore(qwds, qconv, dwds, dconv, efeats_vec) rel_scores[j] = score.value() top = heapq.nlargest(10, rel_scores, key=rel_scores.get) utils.JsonPredsAppend(json_preds, data, i, top) dy.renew_cg() utils.DumpJson(json_preds, 'abel_dev_preds_ep' + str(epoch) + '.json') print('Done')
loss="categorical_crossentropy", metrics=["categorical_accuracy"]) model.load_weights( '/data/scene/scene_classification_res_gpu/weights/scene_InRes.01-1.33.h5' ) if loadmode == 'model': model = load_model( '/data/scene/scene_classification_res_1dense/weights/scene_InRes_Dropout0.00_Lr1.00e-05_Densen1_01-2.87.h5' ) for index in range(len(val_image_paths)): imgpath = val_image_paths[index] ture_score = ture_val_scores[index] img = preprocess(imgpath, image_size) scores = model.predict(np.array([img]))[0] top_number = 1 top_key = heapq.nlargest(top_number, range(len(scores)), scores.take) print('---top_key', top_key) top_value = heapq.nlargest(top_number, scores) print('---top_value', top_value) if ture_score in top_key: accuracy += 1 else: basename = os.path.basename(imgpath) basename = basename.split('.')[0].split( '_')[0] + '_pre_' + str(top_key[0]) + '_f_' + str( top_value[0]) + '.jpg' outpath = os.path.join(outdir, basename) shutil.copy(imgpath, outpath) imgrecord.append(imgpath) scorerecord.append(scores) accuracys.append(accuracy / len(val_image_paths))
def prediction(): """ prediction interface :return: """ print("prediction") log_path = os.path.join(".", config['log_path']) logger = get_logger(log_path) map_file_path = "./pkl/" + domain + "/data.pkl" if os.path.isfile(map_file_path): with open(map_file_path, "rb") as f: id2mid, id2p, entity_entity_sim_Matrix, entity_relation_Adj, truth_label, \ train_entity_list, test_entity_list, valid_entity_list, entity_size, relation_size = pickle.load(f) config['entity_size'] = entity_size config['relation_size'] = relation_size else: id2mid, id2p, entity_entity_sim_Matrix, entity_relation_Adj, truth_label, train_entity_list, \ test_entity_list, valid_entity_list = data_reader(logger, config=config, domain=domain, entity_entity_topk=entity_knn_number) with open(map_file_path, "wb") as f: pickle.dump([ id2mid, id2p, entity_entity_sim_Matrix, entity_relation_Adj, truth_label, train_entity_list, test_entity_list, valid_entity_list, config['entity_size'], config['relation_size'] ], f) test_data = (entity_relation_Adj, entity_entity_sim_Matrix, test_entity_list, truth_label) test_manager = BatchManager(test_data, config['batch_size'], "test") tf_config = tf.ConfigProto() tf_config.gpu_options.allow_growth = True config['decay_steps'] = 100 config['ckpt_path'] = "result/" + datasettype + "/" + domain + "/" rec_results = {} with tf.Session(config=tf_config) as sess: model = create_model(sess, Model, config['ckpt_path'], config, logger) for batch in test_manager.iter_batch(): # each batch is an entity entity_relation_Adj, entity_entity_sim_Matrix, input_entity, input_relation, label = batch relation_z = model.run_step(sess, False, batch) relation_z = relation_z[0] reserved_prop_list = [] for p in range(len(entity_relation_Adj[input_entity[0]])): if entity_relation_Adj[input_entity[0]][p] >= 1: reserved_prop_list.append(p) entityid = input_entity[0] for index in range(len(input_entity)): # each batch is an entity # entityid = input_entity[index] relationid = input_relation[index] if entity_relation_Adj[entityid][ relationid] >= 1: # delete pre-reserved properties relation_z[index] = -2.0 map_property_score = { key: value for key, value in enumerate(relation_z) } exps_list = heapq.nlargest(k, map_property_score, key=map_property_score.get) rec_results[id2mid[entityid]] = [id2p[pid] for pid in exps_list] return rec_results
def topmost(img, top, title=''): n_labels = set(img.ravel()) print(title, heapq.nlargest(top, n_labels)) return int(np.max(list(n_labels))), int(np.mean(list(n_labels)))
def closest_to_vec(self, vec, n=10): # scores = ((self.m.dot(vec) + 1.0) / 2) scores = self.m.dot(vec) return heapq.nlargest(n, zip(scores, self.iw))
def evaluateSimilarityJSON(screen_name,new_tweet): sentiClassA = cs.sentiClass(new_tweet) rake_object = RAKE.Rake(stoplist_file) wiki.set_lang("en") keyword = rake_object.run(new_tweet) # print keyword # print wiki.summary(str(keyword[0][0])).encode("utf-8") # try : # summ_wiki = wiki.summary(str(keyword[0][0])).encode("utf-8") # except (wiki.exceptions.PageError,wiki.exceptions.DisambiguationError) as e: # try: # if keyword[1][1] and keyword[1][1] > 1: # summ_wiki = wiki.summary(str(keyword[1][0])).encode("utf-8") # except (wiki.exceptions.PageError,wiki.exceptions.DisambiguationError) as e: # summ_wiki = str(keyword[1][0]) # aug_tweet = new_tweet.encode('utf-8') + " " + summ_wiki # else : # aug_tweet = new_tweet.encode('utf-8') + " " + summ_wiki # print '\nevaluateSimilarity for tweet:', aug_tweet # preprocessed_msg = cs.slangCleanser(aug_tweet) preprocessed_msg = cs.slangCleanser(new_tweet) preprocessed_msg = cs.lemmatizingText(preprocessed_msg) ldaScoreA = cs.countDocLDA(preprocessed_msg) with open(str(DIR)+'/'+str(screen_name)+'_datasets/'+screen_name+'_followers.txt') as f: followers = f.read().splitlines() tweet_counter = 0 recommendation_list = collections.defaultdict(list) recommendation_list_np = collections.defaultdict(list) recommendation_list_recent = collections.defaultdict(list) recommendation_list_rt = collections.defaultdict(list) recommendation_list_hashtag = collections.defaultdict(list) recommendation_list_senti = collections.defaultdict(list) recommendation_list_prof = collections.defaultdict(list) recommendation_list_c = collections.defaultdict(list) recommendation_list_fnr = collections.defaultdict(list) recommendation_list_csi = collections.defaultdict(list) recommendation_list_cb = collections.defaultdict(list) recommendation_list_eb = collections.defaultdict(list) for user in followers: if os.path.exists(str(DIR)+'/'+str(screen_name)+'_datasets/'+str(user)+'/'+ str(user) +'.txt'): with open(str(DIR)+'/'+str(screen_name)+'_datasets/'+str(user)+'/'+ str(user) +'.txt') as f: all_tweets = f.read().splitlines() if len(all_tweets) < 2: print "pass", user continue print 'estimating for user: '******'/'+str(screen_name)+'_datasets/'+ str(user) +'/'+ str(user) +'_UserProfilingLDA.json'): with open(str(DIR)+'/'+str(screen_name)+'_datasets/'+ str(user) +'/'+ str(user) +'_UserProfilingLDA.json') as json_data: d = json.load(json_data) #User profile description relevance. 
capture user profile desciption start here: if d['profile'] != "": profileInterest = 1-cs.JSD_dist(d['profile'],ldaScoreA) else: profileInterest = 0 #count Recent Hashtag tweets Score start here: if d['hashtag'] != "": hashtagsInterest = 1-cs.JSD_dist(d['hashtag'],ldaScoreA) else: hashtagsInterest = 0 #count Recent Tweets Score start here: if d['recent'] != "": recentInterest = 1-cs.JSD_dist(d['recent'],ldaScoreA) if os.path.exists(str(DIR)+'/'+str(screen_name)+'_datasets/'+str(user)+'/'+ str(user) +'.txt'): #sentiment score features "Open file again for sentiment based" with open(str(DIR)+'/'+str(screen_name)+'_datasets/'+str(user)+'/'+ str(user) +'.txt') as f: data_tweets = f.read().splitlines() sentiScore = cs.sentiInterest(data_tweets,ldaScoreA,sentiClassA) sentiInterest = cs.sentiInterest(data_tweets,ldaScoreA,sentiClassA) "Open file for EAR cosine_sim" with open(str(DIR)+'/'+str(screen_name)+'_datasets/'+str(user)+'/'+ str(user) +'.txt') as f: aggregate_tweets = f.read().replace('\n', ' ') cosine_sim = cs.cosine_sim(new_tweet,aggregate_tweets) else: recentInterest = 0 continue #count recent Retweet Score start here: if d['sharing'] != "": sharingInt = 1-cs.JSD_dist(d['sharing'],ldaScoreA) num_rt = int(d['num_rt']) else: sharingInt = 0 num_rt = 0 #count number of follower Score start here: num_followers = int(d['num_followers']) empirical_based = num_followers*(cosine_sim*int(num_rt)) CSI = (0.4*(recentInterest)) + (0.3*(sharingInt)) + (0.3*(hashtagsInterest)) Topic_based = (0.25*(recentInterest)) + (0.2*(sharingInt)) + (0.2*(hashtagsInterest) + (0.15*(profileInterest))) #If new tweet polarity is neutral, don’t consider sentiment feature #If new tweet polarity is neutral, don’t consider sentiment feature if sentiClassA == "NEUTRAL": sentiScore = 0 SeAT = Topic_based + (0.25*sentiScore) SeAT_NP = CSI + sentiScore # print 'SeAT: ', SeAT,'; recentInterest: ', recentInterest,'; sharingInt: ', sharingInt,'; hashtagsInterest: ', hashtagsInterest,'; profileInterest: ', profileInterest,'; sentiInterest: ', sentiInterest print "--------\n" recommendation_list[user].append(SeAT) recommendation_list_np[user].append(SeAT_NP) recommendation_list_recent[user].append(recentInterest) recommendation_list_rt[user].append(sharingInt) recommendation_list_hashtag[user].append(hashtagsInterest) recommendation_list_senti[user].append(sentiInterest) recommendation_list_prof[user].append(profileInterest) recommendation_list_csi[user].append(CSI) recommendation_list_fnr[user].append(num_followers) recommendation_list_eb[user].append(empirical_based) # Proposed System top_15_SeAT = heapq.nlargest(15, recommendation_list, key=recommendation_list.get) top_15_SeAT_NP = heapq.nlargest(15, recommendation_list_np, key=recommendation_list.get) top_15_rt = heapq.nlargest(15, recommendation_list_rt, key=recommendation_list_rt.get) top_15_hashtag = heapq.nlargest(15, recommendation_list_hashtag, key=recommendation_list_hashtag.get) top_15_senti = heapq.nlargest(15, recommendation_list_senti, key=recommendation_list_senti.get) top_15_prof = heapq.nlargest(15, recommendation_list_prof, key=recommendation_list_prof.get) top_15_recent = heapq.nlargest(15, recommendation_list_recent, key=recommendation_list_recent.get) top_15_csi = heapq.nlargest(15, recommendation_list_csi, key=recommendation_list_csi.get) top_15_fnr = heapq.nlargest(15, recommendation_list_fnr, key=recommendation_list_fnr.get) top_15_eb = heapq.nlargest(15, recommendation_list_eb, key=recommendation_list_eb.get) with 
open(str(RESULT_PATH)+'/result_'+str(screen_name)+'.csv', 'a') as csvfile: evaluation = csv.writer(csvfile,quoting=csv.QUOTE_MINIMAL, lineterminator='\n') evaluation.writerow([new_tweet,top_15_SeAT,top_15_SeAT_NP,top_15_csi,top_15_eb,top_15_fnr,top_15_recent,top_15_rt,top_15_hashtag,top_15_senti,top_15_prof])
        if len(self.max_heap) == len(self.min_heap):
            return (self.max_heap[0] + self.min_heap[0]) / 2
        else:
            return self.max_heap[0]


if __name__ == '__main__':
    mf = MedianFinder()
    mf.add_num(5)
    mf.add_num(4)
    mf.add_num(9)
    mf.add_num(7)
    mf.add_num(3)
    print(mf.min_heap)
    print(mf.max_heap)
    print(mf.find_median())

import heapq
# Push an element onto the heap; heapq keeps the elements of the list `heap` in heap order
heapq.heappush()
# Transform the list x into a heap in place
heapq.heapify()
# Return the n largest items of an iterable; a comparison key may be supplied
heapq.nlargest()
# Return the n smallest items of an iterable; a comparison key may be supplied
heapq.nsmallest()
# Pop an element off the heap; the return value is the heap's smallest item (heapq is a min-heap)
heapq.heappop()
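# A minimal runnable sketch of the five calls listed above (values here are illustrative):
h = []
heapq.heappush(h, 7)
heapq.heappush(h, 3)
heapq.heappush(h, 11)
data = [9, 4, 6]
heapq.heapify(data)              # data now satisfies the min-heap invariant: [4, 9, 6]
print(heapq.nlargest(2, h))      # [11, 7]
print(heapq.nsmallest(2, h))     # [3, 7]
print(heapq.heappop(h))          # 3 -- always the smallest element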
def evaluate(sess, model, name, data, logger, id2mid, id2p): logger.info("evaluate data:{}".format(name)) all_precision = 0 all_ndcg = 0 test_entity_cnt = 0 all_map = 0 all_ndcg_topall = 0 for batch in data.iter_batch(): entity_relation_Adj, entity_entity_sim_Matrix, input_entity, input_relation, label = batch relation_z = model.run_step(sess, False, batch) # batch_size*1 relation_z = relation_z[0] test_entity_cnt += 1 # each batch is an entity reserved_prop_list = [] for p in range(len(entity_relation_Adj[input_entity[0]])): if entity_relation_Adj[input_entity[0]][p] >= 1: reserved_prop_list.append(p) for index in range(len(input_entity)): entityid = input_entity[index] relationid = input_relation[index] if entity_relation_Adj[entityid][relationid] >= 1: relation_z[index] = -2.0 # exclude pre-reserved properties truth = label truth_set = set() for a in range(len(truth)): if truth[a] == 1 and a not in reserved_prop_list: # exclude pre-reserved properties truth_set.add(a) map_property_score = { key: value for key, value in enumerate(relation_z) } exps_list = heapq.nlargest(k, map_property_score, key=map_property_score.get) precision = precision_at_k(exps_list, truth_set, k) ndcg = ndcg_at_k(exps_list, truth_set, k) ranklist_topall = sorted(map_property_score.items(), key=lambda d: d[1], reverse=True) ranklist_topall = [key for key, value in ranklist_topall] map_topall = ap(ranklist_topall, truth_set) ndcg_topall = ndcg_at_k(ranklist_topall, truth_set, 10000) all_precision += precision all_ndcg += ndcg all_map += map_topall all_ndcg_topall += ndcg_topall all_precision = all_precision / test_entity_cnt all_ndcg = all_ndcg / test_entity_cnt all_map = all_map / test_entity_cnt all_ndcg_topall = all_ndcg_topall / test_entity_cnt if name == "valid": best_dev_precision = model.best_dev_precision.eval() best_dev_ndcg = model.best_dev_ndcg.eval() if all_precision >= best_dev_precision and all_ndcg >= best_dev_ndcg: tf.assign(model.best_dev_precision, all_precision).eval() tf.assign(model.best_dev_ndcg, all_ndcg).eval() logger.info("new best dev precision at {} :{:>.5f}".format( k, all_precision)) logger.info("new best dev ndcg at {} :{:>.5f}".format(k, all_ndcg)) return all_precision > best_dev_precision and all_ndcg > best_dev_ndcg elif name == "test": best_test_precision = model.best_test_precision.eval() best_test_ndcg = model.best_test_ndcg.eval() if all_precision >= best_test_precision and all_ndcg >= best_test_ndcg: tf.assign(model.best_test_precision, all_precision).eval() tf.assign(model.best_test_ndcg, all_ndcg).eval() logger.info("new best test precision at {} :{:>.5f}".format( k, all_precision)) logger.info("new best test ndcg at {} :{:>.5f}".format( k, all_ndcg)) return all_precision, all_ndcg, all_map, all_ndcg_topall
if (val_frecuencia == None): val_frecuencia = 0 else: puntaje_final += val_frecuencia return puntaje_final cadena_hex = "1b37373331363f78151b7f2b783431333d78397828372d363c78373e783a393b3736" #cadena = cadena_hex.decode("hex") cadena = bytes.fromhex(cadena_hex).decode('utf-8') spell = SpellChecker() res = '' puntaje_final = 0 puntaje_final = [] for i in range(256): for j in cadena: res_byte = ord(j) ^ i res += chr(res_byte) puntaje_actual = analiza_frecuencia(res) puntaje_final.append((puntaje_actual, res, i)) res = '' ult = heapq.nlargest(3, puntaje_final) for i in range(3): #print(ult[i][1]) palabras = ult[i][1].split(' ') mejor_palabra = spell.known(palabras) if (mejor_palabra): print(ult[i][1]) break
import heapq as heap L = [] heap.heappush(L, 20) heap.heappush(L, 14) heap.heappush(L, 5) heap.heappush(L, 15) heap.heappush(L, 10) heap.heappush(L, 2) print(L) print(heap.heappop(L)) print(L) print(heap.heappushpop(L, 18)) print(L) L1 = heap.nlargest(3, L) print(L1) L2 = heap.nsmallest(3, L) print(L2) L3 = [20, 14, 2, 15, 10, 21] print(L3) heap.heapify(L3) print(L3)
def preprocesser(vals, refine=2, Rm_Outliers=False, Filter=True, Median=False, Mean=True): # Determine spatial resolution resolution = vals.shape[0] if Rm_Outliers: # Identify and remove outliers outlier_buffer = 5 vals_list = vals.reshape((resolution * resolution, )) vals_mins = heapq.nsmallest(outlier_buffer, vals_list) vals_maxes = heapq.nlargest(outlier_buffer, vals_list) # Cap max and min vals_min = np.max(vals_mins) vals_max = np.min(vals_maxes) # Trim outliers over = (vals > vals_max) under = (vals < vals_min) # Remove outliers vals[over] = vals_max vals[under] = vals_min else: vals_min = np.max(vals) vals_max = np.min(vals) if Filter: # Apply median/mean filter if Median: vals = median_filter(vals) if Mean: vals = mean_filter(vals) # Create grid start = 0.0 end = 1.0 x = np.linspace(start, end, resolution) y = np.linspace(start, end, resolution) [X, Y] = np.meshgrid(x, y) interp_vals = interp2d(x, y, vals, kind='cubic') # Create refined grid plot_start = 0.0 plot_end = 1.0 plot_x = np.linspace(plot_start, plot_end, refine * resolution) plot_y = np.linspace(plot_start, plot_end, refine * resolution) [plot_X, plot_Y] = np.meshgrid(plot_x, plot_y) vals_int_values = interp_vals(plot_x, plot_y) return vals_int_values, plot_X, plot_Y
def extract(query, choices, *, scorer=quick_ratio, score_cutoff=0, limit=10): it = _extraction_generator(query, choices, scorer, score_cutoff) key = lambda t: t[1] if limit is not None: return heapq.nlargest(limit, it, key=key) return sorted(it, key=key, reverse=True)
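# A hedged usage example (the choice/score tuple shape is an assumption inferred from
# key=lambda t: t[1] above; quick_ratio comes from the surrounding module):
#   matches = extract("appel", ["apple", "apply", "banana"], limit=2)
#   e.g. [("apple", 0.9), ("apply", 0.6)] -- the two best-scoring choices, best first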
pool_subset = 2000 pool_subset_dropout = np.asarray( random.sample(range(0, X_Pool.shape[0]), pool_subset)) X_Pool_Dropout = X_Pool[pool_subset_dropout, :, :, :] y_Pool_Dropout = y_Pool[pool_subset_dropout] #compute the class predicted probabilities for the pool points predicted_probabilities = model.predict(X_Pool_Dropout, batch_size=batch_size, verbose=1) BvSB = 0 #compute BvSB for each pool point for d in range(predicted_probabilities.shape[0]): D = predicted_probabilities[d, :] Z = heapq.nlargest(2, D) v = np.absolute(np.diff(Z)) BvSB = np.append(BvSB, v) # Finding the minimum "Queries" probability difference values BvSB = BvSB[1:] a_1d = BvSB.flatten() x_pool_index = a_1d.argsort()[-Queries:] # find maximum probability difference values # BvSB = BvSB[1:] # a_1d = BvSB.flatten() # x_pool_index = a_1d.argsort()[-Queries:][::-1] # THIS FINDS THE MINIMUM INDEX # a_1d = U_X.flatten()
def top(self, n): """Return (count, obs) tuples for the n most frequent observations.""" return heapq.nlargest(n, [(v, k) for (k, v) in self.dictionary.items()])
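# A hedged usage note (assumes self.dictionary maps observation -> count, as the
# comprehension above implies):
#   counter.dictionary = {'a': 5, 'b': 2, 'c': 9}
#   counter.top(2)  ->  [(9, 'c'), (5, 'a')]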
def find_similar(self, t, n=5): e = self.get_embedding(t) sim = CosineSimilarity() similarities = [sim(e, se).item() for se in self.embeddings] top = heapq.nlargest(5, enumerate(similarities), key=lambda p: p[1]) return top
import heapq list = [23, 2, 6, 0, 14, 7] stocks = [{ 'ticker': 'AAPL', 'price': 201 }, { 'ticker': 'GOOG', 'price': 800 }, { 'ticker': 'FB', 'price': 54 }, { 'ticker': 'MSFT', 'price': 333 }, { 'ticker': 'TUNA', 'price': 789 }] print(heapq.nlargest(1, stocks, key=lambda stocks: stocks["price"]))
def equalWeightsSP500Select(features): """ 1. Get the number of stocks in each sector 2. Allocate capital according to the proportion of stocks in that sector 3. Randomly pick different stocks in the sector 4. Rebalance every time when there is changes in S&P constituents and pick different stocks to reduce variance """ # Get the inclusion matrix inclusionMatrix = getTickersSP500(ticker=features.tickers, startDate=features.startDate, endDate=features.endDate, asMatrix=True) # Create dataframe with tickers as columns px = features.subset(fields='bb_live', asDataFeatures=True) selection = inclusionMatrix.copy() selection.iloc[:, :] = 0.0 #initialise rebalDates = inclusionMatrix.index[~( inclusionMatrix == inclusionMatrix.shift(1)).all(axis=1)].tolist() selection = selection.reindex(index=rebalDates) save = selection.iloc[0, :] # Loop through the rebalance dates for date in rebalDates: # If stock in portfolio no longer in S&P if (save * inclusionMatrix.loc[date, :]).sum() != 1: gics = getGICSDescription() sectors = gics.sector.unique() numofstock = [] # Find out the number of stocks required for each sector for i in sectors: fins = gics[gics.sector == i].industry.unique().tolist() # For robustness tkr = getTickersSP500(ticker=features.tickers, industry=fins, zoom=str(date)).ticker.unique().tolist() numofstock.append(len(tkr)) newnumofstock = [int(i / sum(numofstock) * 50) for i in numofstock] if sum(newnumofstock) != 50: for i in heapq.nlargest(50 - sum(newnumofstock), range(len(newnumofstock)), newnumofstock.__getitem__): newnumofstock[i] += 1 includedAtDate = (inclusionMatrix.loc[date, :] > 0.0).values # Add the stock into the portfolio if chosen for i, d in enumerate(sectors): fins = gics[gics.sector == d].industry.unique().tolist() tkr = getTickersSP500(ticker=features.tickers, industry=fins, zoom=str(date)).ticker.unique().tolist() tkr = [tk for tk, incl in zip(tkr, includedAtDate) if incl] random.seed(10) random.shuffle(tkr) selected = tkr[:newnumofstock[i]] selection.loc[date, selected] = 0.02 save = selection.loc[date, selected] selection = selection[selection.sum(axis=1) == 1] return selection
def most_common(self, n=None): if n is None: return sorted(self.items(), key=_itemgetter(1), reverse=True) return _heapq.nlargest(n, self.items(), key=_itemgetter(1))
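# This is essentially collections.Counter.most_common; a quick illustration of the two
# branches, assuming the instance holds {'x': 3, 'y': 7, 'z': 1}:
#   most_common()   ->  [('y', 7), ('x', 3), ('z', 1)]   (full sort when n is None)
#   most_common(2)  ->  [('y', 7), ('x', 3)]             (heapq.nlargest when n is given)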
def simStep(self, reward, state): numActive = int(self._activeRatio * self._numHidden) self._stimuli = np.dot(self._weightsFF, state) activations = self._stimuli - self._biases # Generate tuples for sorting heap = [(activations.item(0), 0)] for i in range(1, self._numHidden): heapq.heappush(heap, (activations.item(i), i)) # Use sorted information for inhibition hiddenStatesPrev = copy(self._hiddenStates) self._sparsities = np.zeros((self._numHidden, 1)) nLargest = heapq.nlargest(numActive, heap, key=itemgetter(0)) # Inhibition for i in range(0, numActive): self._sparsities[nLargest[i][1]] = 1.0 self._hiddenStates = np.multiply(self._sparsities, activations) # Q q = np.dot(self._weightsQ, self._hiddenStates).item(0) # Action action = np.tanh(np.dot(self._weightsAction, self._hiddenStates)) actionExp = copy(action) for i in range(0, self._numAction): if np.random.rand() < self._noise: actionExp[i] = np.random.rand() * 2.0 - 1.0 #actionExp = np.minimum(1.0, np.maximum(-1.0, action + np.random.randn(self._numAction, 1) * self._noise)) # Reconstruction recon = np.dot(self._weightsFF.T, self._hiddenStates) delta = state - recon # Update self._weightsFF += self._alphaFF * np.dot(self._hiddenStates, delta.T) tdError = reward + self._gamma * q - self._prevV self._tracesQ = np.maximum(self._tracesQ * self._lambda, hiddenStatesPrev.T) self._weightsQ += self._alphaQ * tdError * self._tracesQ if tdError > 0.0: self._weightsAction += self._alphaAction * np.dot( self._actionDelta, hiddenStatesPrev.T) self._biases += self._alphaBias * (self._stimuli - self._biases) self._prevV = q self._actionDelta = actionExp - action return actionExp
def nlargest(self, n, *args, **kwargs): return heapq.nlargest(n, self, *args, **kwargs)
        WORD2COUNT[word] += 1
    # Normalise by the maximum count once, before mutating the dict; recomputing
    # max() inside the loop would change the denominator as values are overwritten.
    max_count = max(WORD2COUNT.values())
    for key in WORD2COUNT:
        WORD2COUNT[key] = WORD2COUNT[key] / max_count
    return WORD2COUNT

SENT2SCORE = {}
WORD2COUNT = word2count_score(TEXT)

def sent2count_score():
    '''Creating Dictionary for each sentence score using their word count score'''
    for sentence in SENTENCES:
        for word in nltk.word_tokenize(sentence.lower()):
            if word in WORD2COUNT:
                # if len(sentence.split(' ')) < 25:
                if sentence not in SENT2SCORE:
                    SENT2SCORE[sentence] = WORD2COUNT[word]
                else:
                    SENT2SCORE[sentence] += WORD2COUNT[word]
    return SENT2SCORE

SENT2SCORE = sent2count_score()

# Selecting the top scored sentences for our extractive summary
BEST_SENTENCES = nlargest(120, SENT2SCORE, key=SENT2SCORE.get)
print('Following are the best rated sentences -->>')
for best_sentence in BEST_SENTENCES:
    print(best_sentence)
def getDataForUrl(url): data = {} person_details = [] personFound = False urllib.request.urlretrieve(url, os.getcwd() + '/' + str(1) + ".jpg") imgg2 = cv2.imread(os.getcwd() + '/' + str(1) + ".jpg") imgg = cv2.cvtColor(imgg2, cv2.COLOR_BGR2RGB) boxes = face_recognition.face_locations(imgg, model="hog") encodings = face_recognition.face_encodings(imgg, boxes) for encoding in encodings: matches = face_recognition.compare_faces(X, encoding) matchCount = {} for inx, m in enumerate(matches): if (m): if (y[inx] not in matchCount): matchCount[y[inx]] = 1 else: matchCount[y[inx]] += 1 print(matchCount) if (not bool(matchCount)): continue predictedName = max(matchCount.items(), key=operator.itemgetter(1))[0] print('Person identified: ' + predictedName) if (matchCount[predictedName] < 4): continue personFound = True for index, row in df.iterrows(): if (row['Name'] == predictedName.rsplit(' ', 1)[0]): person_details.append(row.to_dict()) data['Person details'] = person_details img = Image.open(os.getcwd() + '/' + str(1) + ".jpg") x = img.resize((224, 224)) arr = np.array(x) arr2 = np.reshape(arr, (1, 224, 224, 3)) arr2 = arr2 / 255 arr3 = pre_model.predict(arr2) predd = model.predict(arr3) pred = predd[0].tolist() sortedd = heapq.nlargest(3, pred) g1 = pred.index(sortedd[0]) g2 = pred.index(sortedd[1]) g3 = pred.index(sortedd[2]) data['Sports Guess 1'] = key[g1] data['Sports Guess 2'] = key[g2] data['Sports Guess 3'] = key[g3] standardResponse = {} successObject = {} successObject['code'] = 200 if (personFound): successObject['message'] = 'Person found' else: successObject['message'] = 'Person not found' successObject['data'] = data standardResponse['success'] = successObject return standardResponse
def get_average_points_per_position(position, players, topn, plot_output=False, print_output=False): """ See if the AVT theory holds true - calculates means/std dev of the 1-topn best players at each position """ # populate a list of values for the top N players at that position to perform statistics on later top_n_players = [[] for _ in range(topn)] for year in range(global_start_year, global_end_year): total_scores_for_year = [ (players[player].get_total_score_for_year(year), player, year) for player in players if players[player].player_position == position ] top_n_pos = nlargest(topn, total_scores_for_year) for (ind, elem) in enumerate(top_n_pos): top_n_players[ind].append((elem[1], elem[2])) # calculate statistics x = [] y = [] e_season = [] e_game = [] for (ind, top_n_scores) in enumerate(top_n_players): # model = each season is a normally distributed variable, with each game as a sample # mean = get mean of all the total scores per player (storing player and year for top n scores) # std dev = (assumption is all seasons are independent) sum variances and square root sum # std dev per game = random sample, with mean = mean/16 and std dev = season std dev / sqrt(16) mean_season = round( np.mean([ players[player].get_total_score_for_year(year) for (player, year) in top_n_scores ]), 3) mean_game = round(mean_season / 16.0, 3) stddev_season = round( np.sum([ players[player].get_stddev_for_year(year)**2 for (player, year) in top_n_scores ])**0.5, 3) stddev_game = round(stddev_season / 4.0, 3) # divide by sqrt(n = 16 games) = 4 if print_output: print(f"For {position}{ind+1}:") print(f"Mean: {mean_season} ({mean_game} points per game)") print(f"Std. Dev: {stddev_season} ({stddev_game} points per game)") x.append(ind + 1) y.append(mean_season) e_season.append(stddev_season) e_game.append(stddev_game) if plot_output: plt.errorbar(np.array(x), np.array(y), np.array(e_season), linestyle='None', marker='o', capsize=2, color="blue", ecolor="red") plt.title( f"Average Points Scored by the Top {topn} {position}s Per Season ") plt.xlabel(f"{position} position at finish") xticks_arr = [str(num + 1) for num in range(topn)] xticks_arr.insert(0, "") plt.xticks(np.arange(topn + 1), xticks_arr) plt.ylabel("Points / Season") plt.show() plt.errorbar(np.array(x), np.array(y) / 16, np.array(e_game), linestyle='None', marker='o', capsize=2, color="blue", ecolor="red") plt.title( f"Average Points Scored by the Top {topn} {position}s Per Game") plt.xlabel(f"{position} position at finish") xticks_arr = [str(num + 1) for num in range(topn)] xticks_arr.insert(0, "") plt.xticks(np.arange(topn + 1), xticks_arr) plt.ylabel("Points / Game") plt.show() return np.array(y), np.array(e_season)
def getListMaxNumIndex(num_list, topk=5):
    # indices of the topk largest values; pairing each value with its index via enumerate
    # avoids the duplicate-value problem of list.index(), which always returns the first match
    max_num_index = [i for i, _ in heapq.nlargest(topk, enumerate(num_list), key=lambda p: p[1])]
    return max_num_index
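# Quick check (illustrative): duplicate values now map to distinct indices.
print(getListMaxNumIndex([3, 9, 9, 1, 7], topk=3))   # [1, 2, 4]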
def _rank(self, ranking, n): """ return the first n sentences with highest ranking """ return nlargest(n, ranking, key=ranking.get)
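# A hedged usage sketch (assumes `ranking` maps sentence -> score, as key=ranking.get implies):
#   ranking = {"First sentence.": 0.9, "Second one.": 0.2, "Third one.": 0.7}
#   self._rank(ranking, 2)  ->  ["First sentence.", "Third one."]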
def findKthLargestHeap(self, nums: List[int], k: int) -> int: return heapq.nlargest(k, nums)[-1]
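# A hedged alternative sketch (not part of the original solution): keep a size-k min-heap
# so only k values are retained while scanning the input.
import heapq

def findKthLargestStreaming(nums, k):
    heap = []
    for n in nums:
        if len(heap) < k:
            heapq.heappush(heap, n)
        else:
            heapq.heappushpop(heap, n)   # push n, then drop the smallest of the k+1
    return heap[0]                        # the root of the min-heap is the kth largest

# e.g. findKthLargestStreaming([3, 2, 1, 5, 6, 4], 2) == 5, matching heapq.nlargest(2, ...)[-1]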
def maximumproduct(nums): import heapq a, b = heapq.nlargest(3, nums), heapq.nsmallest(2, nums) return max(a[0] * a[1] * a[2], a[0] * b[0] * b[1])
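# Quick checks (illustrative): the nsmallest(2) pair covers the case where two large
# negative values multiply into a large positive product.
print(maximumproduct([1, 2, 3, 4]))        # 24   (4 * 3 * 2)
print(maximumproduct([-10, -9, 1, 3, 2]))  # 270  (-10 * -9 * 3)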