def build_btree(speeches):
    """Build a binary tree that organizes speeches by date.

    Every speech is stored under the key ``(speech.date, speech.speech_id)``.
    Tuples compare element by element, so entries are ordered by date first
    and by speech id second; the id is intended to keep keys distinct when
    several speeches share a date.

    :param speeches: iterable of objects exposing ``date`` and ``speech_id``.
    :return: a ``BinaryTree`` mapping ``(date, speech_id)`` to the speech.
        Callers that need the earliest or latest entry can call
        ``min_item()`` / ``max_item()`` on the returned tree.
    """
    speech_tree = BinaryTree()
    for speech in speeches:
        date_id_key = (speech.date, speech.speech_id)
        speech_tree.insert(date_id_key, speech)
    return speech_tree
def build_btree(speeches):
    """Build a binary tree that organizes speeches by date.

    Every speech is stored under the key ``(speech.date, speech.speech_id)``.
    Tuples compare element by element, so entries are ordered by date first
    and by speech id second; the id is intended to keep keys distinct when
    several speeches share a date.

    :param speeches: iterable of objects exposing ``date`` and ``speech_id``.
    :return: a ``BinaryTree`` mapping ``(date, speech_id)`` to the speech.
        Callers that need the earliest or latest entry can call
        ``min_item()`` / ``max_item()`` on the returned tree.
    """
    speech_tree = BinaryTree()
    for speech in speeches:
        date_id_key = (speech.date, speech.speech_id)
        speech_tree.insert(date_id_key, speech)
    return speech_tree
def find_number_in_tree(file_name: str) -> None:
    """Mark every input line as first-seen ('-') or already-seen ('+').

    The first line of ``file_name`` is skipped. Each remaining line, with
    trailing newline characters removed, is looked up in a ``BinaryTree``:
    a value not yet present is inserted and recorded as ``'-'``; a value
    that is already present is recorded as ``'+'``. The markers are then
    written, one per line, to ``bintrees.out`` in the current directory.

    :param file_name: path of the text file to read.
    :return: nothing; the result is only written to ``bintrees.out``.
    """
    tree = BinaryTree()
    with open(file_name) as f:
        lines = f.readlines()

    result = []
    for line in lines[1:]:
        # Strip once and reuse the key for both the lookup and the insert.
        key = line.rstrip('\n')
        if key not in tree:
            tree.insert(key, 0)
            result.append('-')
        else:
            result.append('+')

    with open("bintrees.out", "w+") as f:
        f.writelines(marker + '\n' for marker in result)
def build_btree():
    """Build a binary tree of speech files keyed by ``(date, id)``.

    Walks the module-level ``folder_path`` with ``os.walk``, skipping the
    first entry the walk yields (the top folder itself), so only files
    found in later entries are indexed. Each file is parsed as JSON; its
    ``'date'`` field and the id returned by ``getid(file_name)`` form the
    key, and the file path is stored as the value.

    :return: a ``BinaryTree`` mapping ``(speech_date, speech_id)`` to the
        path of the file the speech was read from. The tree is empty when
        the walk yields nothing.
    """
    speech_tree = BinaryTree()
    walker = os.walk(folder_path)
    # Discard the first walk entry; the default keeps an empty walk from
    # raising StopIteration.
    next(walker, None)
    for root, _sub_folders, files in walker:
        for f in files:
            file_name = root + '/' + f
            # The context manager closes the file even if parsing fails.
            with open(file_name, 'r') as s:
                speech = json.load(s)  # speech is a key-value dictionary
            speech_id = getid(file_name)
            speech_date = speech['date']
            speech_tree.insert((speech_date, speech_id), file_name)
    return speech_tree
def build_btree():
    """Build a binary tree of speech files keyed by ``(date, id)``.

    Walks the module-level ``folder_path`` with ``os.walk``, skipping the
    first entry the walk yields (the top folder itself), so only files
    found in later entries are indexed. Each file is parsed as JSON; its
    ``'date'`` field and the id returned by ``getid(file_name)`` form the
    key, and the file path is stored as the value.

    :return: a ``BinaryTree`` mapping ``(speech_date, speech_id)`` to the
        path of the file the speech was read from. The tree is empty when
        the walk yields nothing.
    """
    speech_tree = BinaryTree()
    walker = os.walk(folder_path)
    # Discard the first walk entry; the default keeps an empty walk from
    # raising StopIteration.
    next(walker, None)
    for root, _sub_folders, files in walker:
        for f in files:
            file_name = root + '/' + f
            # The context manager closes the file even if parsing fails.
            with open(file_name, 'r') as s:
                speech = json.load(s)  # speech is a key-value dictionary
            speech_id = getid(file_name)
            speech_date = speech['date']
            speech_tree.insert((speech_date, speech_id), file_name)
    return speech_tree
class AppProxy(object):
    """Keep a Redis-backed proxy-IP queue filled from weighted providers.

    Providers are stored in a ``BinaryTree`` keyed by the running sum of
    their ratios, so a random integer in ``[1, total]`` combined with
    ``ceiling_item`` selects a provider with probability proportional to
    its ratio.
    """

    def __init__(self, queue_key=None):
        """
        :param queue_key: key handed to ``RedisProxyQueue`` when the queue
            is created in ``start_proxys`` (e.g. "scrapy:ip_proxy_queue").
        """
        # Providers keyed by cumulative ratio (see _create_proxy).
        self.proxys = BinaryTree()
        # Created in start_proxys(); None until then.
        self.proxy_queue_handle = None
        self.proxy_queue_key = queue_key
        # Minimum length of the proxy IP pool.
        self.proxy_queue_min_lenght = cfg.PROXY_MIN_QUEUE_LENGHT
        # Maximum number of proxies requested in a single round.
        self.proxy_increment_num = 20
        # Pause between two polling rounds, in seconds.
        self.sleep_time = 0.2

    def _create_proxy(self, proxycls, ratio):
        """Instantiate one provider and register it under its cumulative ratio.

        :param proxycls: a ``Proxy`` object or the dotted path of a class
            that can be loaded with ``load_object``.
        :param ratio: selection weight of this provider; coerced to ``int``.
        :raises ValueError: if ``proxycls`` is neither of the accepted kinds.
        """
        if not isinstance(ratio, int):
            ratio = int(ratio)

        # NOTE(review): this branch matches Proxy *instances*; if callers pass
        # Proxy subclasses themselves, issubclass() would be needed - confirm.
        if isinstance(proxycls, Proxy):
            obj = proxycls.create(self.proxy_queue_handle, ratio,
                                  cfg.PROXY_SERVER_REQUEST_TIMEOUT)
        elif isinstance(proxycls, six.string_types):
            obj = AppProxy.load_object(proxycls).create(
                self.proxy_queue_handle, ratio,
                cfg.PROXY_SERVER_REQUEST_TIMEOUT)
        else:
            raise ValueError("Not a valid value(%s)" % str(proxycls))

        # Key = previous largest key + this ratio, i.e. the running total.
        last_val = ratio
        if self.proxys.count > 0:
            last_val = self.proxys.max_item()[0] + last_val
        self.proxys.insert(last_val, obj)

    def _get_proxy(self, need_num):
        """Ask one randomly chosen provider for up to ``need_num`` proxies.

        Depending on the demand, the ``get`` method of the selected provider
        is called to add proxy IPs to the pool.

        :param need_num: number of proxies currently missing from the pool.
        :return: whatever the provider's ``get`` returns, or 0 when nothing
            was requested.
        """
        max_weight = self.proxys.max_item()[0]
        random_weight = random.randint(1, max_weight)
        # Smallest cumulative key >= random_weight: ratio-weighted choice.
        proxy = self.proxys.ceiling_item(random_weight)[1]
        num = int(min(proxy.request_max_num, need_num))
        result = 0
        if num > 0:
            result = proxy.get(num)
        log.debug("_on_proxy: (%s)ratio=%d|request_max_num=%d|result=%d" %
                  (proxy.name, proxy.ratio, proxy.request_max_num, result))
        return result

    def _on_procrssed(self):
        """Run one polling round: top the pool up if it is below the minimum."""
        # Current length of the proxy IP pool.
        llen = self.proxy_queue_handle.len()
        # Difference between the configured minimum and the current length.
        need_num = cfg.PROXY_MIN_QUEUE_LENGHT - llen
        result = 0
        if need_num > 0:
            # Never request more than the per-round increment.
            need_num = min(need_num, self.proxy_increment_num)
            result = self._get_proxy(need_num)
        log.debug("_on_procrssed -->: llen=%d|need_num=%d|result=%d" %
                  (llen, need_num, result))

    @staticmethod
    def load_object(path):
        """Load an object given its absolute dotted path, and return it.

        :param path: full dotted path, e.g. 'proxy.proxy.Proxy'
        :return: the attribute named by the last path component, looked up
            on the module named by everything before it.
        :raises ValueError: if ``path`` contains no dot.
        :raises NameError: if the module does not define that attribute.
        """
        try:
            dot = path.rindex('.')
        except ValueError:
            raise ValueError("Error loading object '%s': not a full path" % path)

        module, name = path[:dot], path[dot + 1:]
        mod = importlib.import_module(module)

        try:
            obj = getattr(mod, name)
        except AttributeError:
            raise NameError(
                "Module '%s' doesn't define any object named '%s'" % (module, name))

        return obj

    def start_proxys(self, redis_cfg, proxy_class):
        """Initialise Redis, reset the queue and register every provider.

        :param redis_cfg: configuration passed to ``redis_pool.init_redis``.
        :param proxy_class: mapping of provider (object or dotted path) to
            its ratio.
        """
        redis_pool.init_redis(redis_cfg)
        self.proxy_queue_handle = RedisProxyQueue(self.proxy_queue_key)
        self.proxy_queue_handle.clear()
        for proxycls in proxy_class:
            ratio = proxy_class[proxycls]
            # Creates an instance of the given proxy-IP provider class.
            self._create_proxy(proxycls, ratio)
            log.info("start_proxy: %s --> %d" % (str(proxycls), ratio))

    def run(self):
        """Poll the pool forever, sleeping ``sleep_time`` between rounds.

        Errors raised by a round are logged and the loop carries on.
        Ctrl-C (``KeyboardInterrupt``) ends the loop.
        """
        log.info('###############################################')
        log.info('Now begin.......')
        while True:
            try:
                self._on_procrssed()
                time.sleep(self.sleep_time)
            except KeyboardInterrupt:
                # Swallowing the interrupt would make the loop unstoppable.
                log.info('run interrupted, stopping')
                break
            except Exception as e:
                # Format eagerly: passing str(e) as a separate argument with
                # no placeholder in the message loses the error text.
                log.error('run error: %s' % str(e))
                log.error("run traceback:" + traceback.format_exc())