def __init__(self, *args, **kwargs):
    """Initialize as a list, restore the heap invariant, and set capacity.

    Positional arguments are forwarded to ``list.__init__`` (i.e. an
    optional initial iterable).  Keyword arguments: only ``maxsize`` is
    recognized (default 10); any other keyword is silently ignored, as
    in the original implementation.
    """
    list.__init__(self, *args)
    # The initial items may arrive in any order; heapify is O(n).
    heapq.heapify(self)
    # Idiomatic replacement for the `if 'maxsize' in kwargs` dance.
    self.maxsize = kwargs.get('maxsize', 10)
def _update_unavailable_forwarders(self, forwarder, action):
    """Mark *forwarder* unavailable, or update its recorded load.

    If the forwarder is currently in the available heap, its entry is
    moved to ``self._unavailable_forwarderloads`` (with the load
    decremented when ``action == self.DEPLOY_INPUT``) and the available
    heap is re-heapified.  Otherwise the matching unavailable entry's
    load is decremented for the same action.

    NOTE(review): both branches mutate the shared lists while holding
    only ``self._forwarders_lock.reader_lock`` — confirm whether the
    writer lock should be taken here instead.
    """
    with self._forwarders_lock.reader_lock:
        # Membership test over the forwarders currently marked available.
        if forwarder in {
                forwarder_load.forwarder
                for forwarder_load in self._available_forwarderloads}:
            # Enumerate a deep copy so deleting from the real list below
            # does not disturb iteration; `entry` (the copied object) is
            # what gets appended to the unavailable list.
            for index, entry in enumerate(
                    deepcopy(self._available_forwarderloads)):
                if entry.forwarder == forwarder:
                    if action == self.DEPLOY_INPUT:
                        entry.load -= 1
                    self._unavailable_forwarderloads.append(entry)
                    del self._available_forwarderloads[index]
                    # Restore the heap invariant after the removal.
                    heapq.heapify(self._available_forwarderloads)
                    break
        else:
            # Already unavailable: only adjust the recorded load in place.
            for index, entry in enumerate(
                    self._unavailable_forwarderloads):
                if entry.forwarder == forwarder:
                    if action == self.DEPLOY_INPUT:
                        entry.load -= 1
                    break
def __init__(self, items=None):
    """Create a new PriorityQueueSet.

    items: An initial item list - it can be unsorted and non-unique.
        The data structure will be created in O(N).  Defaults to empty.
        (A ``None`` sentinel replaces the original mutable default
        argument ``items=[]``, which would have been shared across
        calls.)
    """
    if items is None:
        items = []
    # Membership dict: item -> True; duplicates collapse here.
    self.set = dict((item, True) for item in items)
    # Materialize the keys explicitly: under Python 3, dict.keys()
    # returns a view that heapq.heapify cannot mutate.
    self.heap = list(self.set.keys())
    heapq.heapify(self.heap)
def set_proxy_status(self, url, proxy, status):
    """.. :py:method::

    Currently the crawler only sets status after a proxy failed.
    Once the failure count reaches ``self.FAIL_THRESHOLD``, the proxy is
    removed from ``self._pool``, ``self._table[domain]`` and the
    domain's priority queue.

    :param url: only used to derive the domain (netloc).
    :param proxy: proxy identifier as stored in the domain table.
    :param status: ``self.FAILED`` or ``self.SUCCESS`` (success is a
        no-op for now).
    """
    domain = urlparse.urlparse(url).netloc
    proxies_table = self._table[domain]
    now = time.time()
    if status == self.FAILED:
        if proxy in proxies_table:
            last_time, count = proxies_table[proxy]
            # Bump the failure count via the configured counting rule.
            _count = self._count_rule('set', count)
            if _count >= self.FAIL_THRESHOLD:
                # Too many failures for this domain: drop the proxy.
                proxies_table.pop(proxy)
                # 1. this proxy not available, remove from pool
                # 2. this proxy available for other sites, not remove
                self._pool.remove(proxy)
                if 'priority' in proxies_table:
                    # The queue stores [last_time, count, proxy] lists;
                    # re-heapify after the linear remove().
                    proxies_table['priority'].remove(
                        [last_time, count, proxy])
                    heapq.heapify(proxies_table['priority'])
            else:
                # Record the new count both in the table entry and in
                # the matching priority-queue item.
                # NOTE(review): the fallback branch below stores a tuple
                # (now, 1), and tuples do not support item assignment —
                # this line assumes entries are mutable lists on the
                # normal insertion path; confirm against the code that
                # populates proxies_table.
                proxies_table[proxy][1] = _count
                idx = proxies_table['priority'].index(
                    [last_time, count, proxy])
                proxies_table['priority'][idx][1] = _count
        else:
            # not executed here currently (defensive fallback)
            proxies_table[proxy] = (now, 1)
    elif status == self.SUCCESS:
        pass
def _handle_forwarders(self):
    """Synchronize forwarder state with the current settings.

    Rebuilds the available/unavailable forwarder-load heaps from the
    settings, refreshes the forwarders snapshot, and schedules reset
    tasks (via the thread pool) for forwarders that are new, disabled,
    or deleted, then blocks until all resets finish.
    """
    forwarder_schema = self._dispatch_schema_manager.get_forwarder_schema()
    # All configured forwarders: name -> setting.
    forwarders = {
        forwarder_name: forwarder_setting
        for forwarder_name, forwarder_setting in
        self._settings[forwarder_schema].iteritems()
    }
    # Update available forwarders and forwarders dispatch map
    with self._forwarders_lock.writer_lock:
        # Heap of ForwarderLoad entries for every enabled forwarder,
        # seeded with its current load from the snapshot manager.
        self._available_forwarderloads = [
            ForwarderLoad(
                forwarder,
                self._dispatch_snapshot_manager.get_forwarder_load(
                    forwarder))
            for forwarder, forwarder_setting in forwarders.iteritems()
            if not self._dispatch_schema_manager.forwarder_is_disabled(
                forwarder_setting)
        ]
        heapq.heapify(self._available_forwarderloads)
        self._unavailable_forwarderloads = []
    with self._forwarders_snapshot_lock.writer_lock:
        self._forwarders_snapshot = \
            self._get_forwarders_snapshot_callback()
        # Forwarders new to reset: configured but absent from snapshot.
        forwarders_reset_new = {
            forwarder_name: forwarder_setting
            for forwarder_name, forwarder_setting in
            forwarders.iteritems()
            if forwarder_name not in self._forwarders_snapshot
        }
        # Forwarders exist to reset: in snapshot but currently disabled.
        forwarders_reset_exist = {
            forwarder_name: forwarder_setting
            for forwarder_name, forwarder_setting in
            forwarders.iteritems()
            if forwarder_name in self._forwarders_snapshot and
            self._dispatch_schema_manager.forwarder_is_disabled(
                forwarder_setting)
        }
        # Forwarders delete to reset: in snapshot, no longer configured.
        forwarders_reset_delete = {
            forwarder_name: forwarder_setting
            for forwarder_name, forwarder_setting in
            self._forwarders_snapshot.iteritems()
            if forwarder_name not in forwarders
        }
        # Update forwarder snapshot with the latest settings.
        for forwarder_name, forwarder_setting in forwarders.iteritems():
            if forwarder_name in self._forwarders_snapshot:
                self._forwarders_snapshot[forwarder_name] = \
                    deepcopy(forwarder_setting)
        try:
            self._update_forwarders_snapshot_callback(
                self._forwarders_snapshot)
        except Exception as e:
            # Best-effort persistence: a failed snapshot update is only
            # logged and retried on the next invocation.
            log.logger.warn(
                "message=\"Update forwarders snapshot failed, "
                "will try to update forwarders snapshot next "
                "time\" "
                "detail_info=\"%s\"",
                traceback.format_exc(e))
    # Fan the reset work out to the thread pool, one task per forwarder.
    # NOTE(review): the nesting of this section relative to the snapshot
    # writer lock is reconstructed from flattened source — confirm the
    # submit/wait section runs after the lock is released.
    handle_futures = []
    for forwarder_name, forwarder_setting in \
            forwarders_reset_new.iteritems():
        handle_futures.append(
            self._threadpool_executor.submit(self._reset_forwarder,
                                             forwarder_name,
                                             forwarder_setting,
                                             self.FORWARDER_NEW))
    for forwarder_name, forwarder_setting in \
            forwarders_reset_exist.iteritems():
        handle_futures.append(
            self._threadpool_executor.submit(self._reset_forwarder,
                                             forwarder_name,
                                             forwarder_setting,
                                             self.FORWARDER_EXIST))
    for forwarder_name, forwarder_setting in \
            forwarders_reset_delete.iteritems():
        handle_futures.append(
            self._threadpool_executor.submit(self._reset_forwarder,
                                             forwarder_name,
                                             forwarder_setting,
                                             self.FORWARDER_DELETE))
    # Wait until all tasks are done
    futures.wait(handle_futures, return_when=futures.ALL_COMPLETED)
def findKthLargest(self, nums, k):
    """Return the k-th largest element of ``nums`` (1-indexed).

    Heapifies ``nums`` in place in O(n), then pops the n-k smallest
    elements so the next pop is the answer — O(n + (n-k+1) log n),
    without materializing a full sorted copy as the original did.

    :param nums: non-empty list of comparable values (mutated in place).
    :param k: 1 <= k <= len(nums).
    """
    # Bug fix: heapq is a top-level stdlib module; `from Queue import
    # heapq` only worked by accident of Queue's Python 2 internals.
    import heapq
    heapq.heapify(nums)
    # Discard everything strictly below the k-th largest.
    for _ in range(len(nums) - k):
        heapq.heappop(nums)
    return heapq.heappop(nums)
def search(self, query, return_length=100, passage_len=50, return_urls_only=False):
    '''
    Performs search on loaded data.

    Pipeline: tokenize the query, read and decode each term's posting
    list from the on-disk index, intersect the postings, score the
    candidates with BM25 (keeping at most return_length of them via a
    min-heap), then re-rank with a passage-feature score.

    Returns list of sorted by rank:
      * tuples (url_id, url, score) if return_urls_only == False
      * tuples (url_id, url) if return_urls_only == True
    (Each entry ends with the score unless return_urls_only strips it.)

    NOTE(review): word_index[z] is assumed to map doc_id ->
    (doc_length, positions_list) based on how it is indexed below —
    confirm against from_flat().
    '''
    query = query.strip()
    # Python 2: filter() returns a list here, so len() below is valid.
    words = filter(lambda x: x != '', query.split(" "))
    result = None
    if len(words) == 0:
        return []
    # word_index[z] stays None for out-of-dictionary terms.
    word_index = [None] * len(words)
    for z, word in enumerate(words):
        word = self.norm(word.decode('utf-8').strip())
        if word in self.dictionary:
            # dictionary maps term -> byte offset of its posting line.
            self.index.seek(self.dictionary[word], 0)
            compressed = self.index.readline().strip()
            decompressed = None
            if self.encoding == VARBYTE:
                decompressed = decode_varbyte(base64.b64decode(compressed))
            elif self.encoding == SIMPLE9:
                decompressed = decode_simple9(base64.b64decode(compressed))
            decompressed, word_index[z] = from_flat(decompressed)
            if result == None:
                result = decompressed
            else:
                # join() intersects candidate doc-id lists.
                result = join(result, decompressed)
    # Standard BM25 constants.
    k1 = 2
    b = 0.75
    if result == None or len(result) == 0:
        return []
    # Now we have a list of candidates. We apply BM25 to leave only
    # return_length of them.
    # Average document length, estimated from the first in-dictionary
    # term's postings (divided by the candidate count).
    avg_len = 0.
    j = 0
    while word_index[j] == None:
        j += 1
    for i in xrange(len(result)):
        if result[i] in word_index[j]:
            avg_len += word_index[j][result[i]][0]
    avg_len /= len(result)
    BM25 = [0] * len(result)
    for j in xrange(len(words)):
        if word_index[j] != None:
            idf = log(float(self.N) / len(word_index[j]))
            for i in xrange(len(result)):
                if result[i] in word_index[j]:
                    # tf = occurrences / document length.
                    tf = float(len(word_index[j][result[i]][1])) / word_index[j][result[i]][0]
                    BM25[i] += tf * idf / (tf + k1 * (b + word_index[j][result[i]][0] / avg_len * (1 - b)))
    if len(result) > return_length:
        # Keep only the return_length highest-BM25 candidates using a
        # min-heap of (rank, doc_id).
        tpr = [(x, y) for x, y in zip(BM25, result)]
        heap = tpr[:return_length]
        heapq.heapify(heap)
        for rank, ind in tpr[return_length:]:
            # nsmallest(1, heap)[0] is the current minimum of the heap.
            if heapq.nsmallest(1, heap)[0][0] < rank:
                heapq.heappop(heap)
                heapq.heappush(heap, (rank, ind))
        result = [ind for rank, ind in heap]
    # Now we have a shortened list of candidates.
    # We apply the passage algorithm to score the top passages.
    scores = [0] * len(result)
    for i in xrange(len(result)):
        # passage: sorted (position, term_index) pairs over all query
        # terms that occur in this document.
        passage = []
        for j in xrange(len(words)):
            if word_index[j] != None and result[i] in word_index[j]:
                passage.extend([(x, j) for x in word_index[j][result[i]][1]])
        passage.sort()
        l = 0
        r = 0
        features = [0] * 5
        # Score every window [l, r] no wider than passage_len positions.
        for l in xrange(len(passage)):
            for r in xrange(l, len(passage)):
                if passage[r][0] - passage[l][0] + 1 > passage_len:
                    continue
                passage_w = [x[1] for x in passage[l:r+1]]
                # f0: fraction of distinct query terms covered.
                features[0] = len(set([x[1] for x in passage[l:r+1]])) / float(len(words))
                # f1: earliness of the window within the document.
                features[1] = 1 - float(passage[l][0]) / word_index[passage[l][1]][result[i]][0]
                # f2: density (fewer gaps -> higher).
                features[2] = 1 - float(r - l + 1) / (passage[r][0] - passage[l][0] + 1)
                # f3: tf-idf mass of the window, idf normalized by log N.
                features[3] = 0
                for j in xrange(len(words)):
                    if word_index[j] != None:
                        idf = log(float(self.N) / len(word_index[j])) / log(self.N)
                        tf = float(passage_w.count(j)) / (passage[r][0] - passage[l][0] + 1)
                        features[3] += tf * idf
                # f4: normalized inversion count (query-order agreement).
                features[4] = 0
                for j in xrange(len(passage_w)-1):
                    for k in xrange(j + 1, len(passage_w)):
                        if passage_w[j] > passage_w[k]:
                            features[4] += 1
                if len(passage_w) != 1:
                    features[4] /= float(len(passage_w) * (len(passage_w) - 1) / 2)
                # Unweighted sum of the five features; keep the best
                # window score per document.
                score = reduce(lambda x,y: x + y, features)
                if score > scores[i]:
                    scores[i] = score
    final_result = []
    for score, url_id in sorted(zip(scores, result), reverse=True):
        final_result.append((url_id, self.urls[url_id], score))
    if return_urls_only:
        # Drop the trailing score from each tuple.
        final_result = [x[:-1] for x in final_result]
    return final_result