def load_spot_once(self, pn=1, city_id=10186):
    ''' load spot once '''
    data = {
        'sAct': 'KMdd_StructWebAjax|GetPoisByTag',
        'iMddid': city_id,
        'iTagId': 0,
        'iPage': pn,
    }
    data = self.load_sn(data)
    print(data)
    req = get_request_proxy(self.AJAX_ROUTER_URL, 11, data=data)
    if req is None or 'data' not in req or 'list' not in req['data']:
        if can_retry('{}{}{}'.format(self.AJAX_ROUTER_URL, city_id, pn)):
            self.load_spot_once(pn, city_id)
        return
    spot_list = req['data']['list']
    spot_pn = req['data']['page']
    spot_tmp = re.findall('<h3>.*?(.*?)</h3>', spot_list)
    try:
        total_pn = int(re.findall('共<span>(.*?)</span>', spot_pn)[0])
    except Exception as e:
        total_pn = 1
        echo(0, 'City id:', city_id, 'Page:', pn, spot_pn, e)
    if city_id not in self.spot_result:
        self.spot_result[city_id] = spot_tmp
    else:
        self.spot_result[city_id] += spot_tmp
    self.spot_pn[city_id] = total_pn
def load_gather(self):
    """ load gather proxy pool text
        If this fails, reactivate the cookie via get_cookie().
    """
    headers = {
        'pragma': 'no-cache',
        'cache-control': 'no-cache',
        'Host': 'www.gatherproxy.com',
        'Origin': 'http://www.gatherproxy.com',
        'Referer': 'http://www.gatherproxy.com/proxylist/anonymity/?t=Transparent',
        'Cookie': '_lang=en-US; _ga=GA1.2.1084455496.1548351129; _gid=GA1.2.1515017701.1552361687; ASP.NET_SessionId=ckin3pzyqyoyt3zg54zrtrct; _gat=1; arp_scroll_position=57',
        'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Accept-Encoding': '',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3682.0 Safari/537.36',
    }
    url = 'http://www.gatherproxy.com/subscribe/infos'
    sid_url_req = requests.get(url, headers=headers, verify=False)
    sid_url_html = BeautifulSoup(sid_url_req.text, 'html.parser')
    sid_url = sid_url_html.find_all('div', class_='wrapper')[1].find_all('a')[0]['href']
    if len(sid_url.split('sid=')) < 2:
        echo(0, 'cookie error')
        self.get_cookie()
        self.load_gather()
        return
    sid = sid_url.split('sid=')[1]
    sid_url = 'http://www.gatherproxy.com' + sid_url
    data = {'ID': sid, 'C': '', 'P': '', 'T': '', 'U': '0'}
    gatherproxy = requests.post(sid_url, headers=headers, data=data, verify=False)
    with codecs.open(data_dir + 'gatherproxy', 'w', encoding='utf-8') as f:
        f.write(gatherproxy.text)
def js_compile_sn(self, prepare_map):
    ''' js compile sn '''
    wait_js = '<script>' + self.result_js + '</script>'
    sn = self.js_compile.call('analysis_js', wait_js, self.slat, prepare_map)
    echo(2, '_sn', sn)
    return sn
def prepare_js(self):
    ''' prepare js '''
    pre_text = basic_req(self.JD_URL, 3)
    INDEX_JS_URL = re.findall(r'src=.*index\.js.*" t', pre_text)[0].split('"')[1]
    origin_js = basic_req(INDEX_JS_URL, 3)

    # decode js
    decode_js = codecs.unicode_escape_decode(origin_js)[0]

    # params replace
    replace_list_str = decode_js.split(';')[2]
    empty_index = replace_list_str.index(' ') + 1
    begin_index = replace_list_str.index('=[') + 2
    end_index = replace_list_str.index(']')
    replace_list = replace_list_str[begin_index:end_index].split(',')
    rp = replace_list_str[empty_index:begin_index - 2]
    for ii, jj in enumerate(replace_list):
        decode_js = decode_js.replace('{}[{}]'.format(rp, ii), jj)
    self.slat = replace_list[46].replace('"', '')  # the salt used by js_compile_sn
    echo(2, 'salt', self.slat)

    # dump decoded js to local
    with open(decoder_js_path, 'w') as f:
        f.write(';\n'.join(decode_js.split(';')))

    # delete the ajax-related function
    del_str = re.findall(r'_.{6,10}\["ajaxPrefilter.*\)\}\}\)', decode_js)
    del_begin_index = decode_js.index(del_str[0])
    result_js = decode_js[:del_begin_index] + \
        decode_js[del_begin_index + len(del_str[0]):]
    self.result_js = result_js
    self.js_compile = execjs.compile(open(hotel_js_path).read())
    echo(1, 'Load hotel index js success!!!')
def dbcanuseproxy(self):
    """ check whether the db already has this data """
    results = self.selectproxy([ii[0] for ii in self.canuseip.values()])
    ss_len = len([1 for ii in self.canuseip.values() if ii[1] > 1])
    echo(2, 'SS proxies %d' % ss_len)
    insertlist = []
    updatelist = []
    ipmap = {}
    if results is not False:
        for ip_info in results:
            ipmap[ip_info[1]] = [ip_info[0], ip_info[2]]
        for ip_now in self.canuseip.values():
            http_type = ip_now[1]
            ip_now = ip_now[0]
            if ip_now in ipmap:
                if ipmap[ip_now][1]:
                    updatelist.append((ipmap[ip_now][0], ip_now, http_type, 0))
            else:
                insertlist.append((ip_now, http_type))
        if len(insertlist):
            self.insertproxy(insertlist)
        if len(updatelist):
            self.updateproxy(updatelist, 0)
    self.canuseip = {}
def initproxy(self):
    """ init proxy list """
    results = self.Db.select_db(self.select_list)
    self.proxylist = []
    self.proxylists = []
    self.proxylist_ss = []
    self.proxylists_ss = []
    if results != 0:
        for index in results:
            if index[1] == 1:    # https
                self.proxylists.append(index[0])
            elif index[1] == 2:  # http, ss support
                self.proxylist.append(index[0])
                self.proxylist_ss.append(index[0])
            elif index[1] == 3:  # https, ss support
                self.proxylists.append(index[0])
                self.proxylists_ss.append(index[0])
            else:                # http
                self.proxylist.append(index[0])
        echo(2, len(self.proxylist), ' http proxy can use.')
        echo(2, len(self.proxylists), ' https proxy can use.')
        echo(2, len(self.proxylist_ss), ' ss http proxy can use.')
        echo(2, len(self.proxylists_ss), ' ss https proxy can use.')
    else:
        echo(0, 'Please check the db configuration!!! The proxy pool cannot be used!!!>>>')
def parse_detail(self):
    ''' parse hotel detail '''
    version = begin_time()
    text = self.get_hotel_detail()
    html = BeautifulSoup(text['html'], 'html.parser')
    trs = html.findAll('tr')[2:]

    hotel_detail = []
    for tr in trs:
        room_name = re.findall('baseroomname="(.*?)"', str(tr))
        if not len(room_name):
            room_name = re.findall('rel="nofollow">\n(.*?)\n', str(tr))
        room_name = room_name[0].strip() if len(room_name) else hotel_detail[-1][0]
        price = re.findall(r'</dfn>(\d{4,5}?)</span>', str(tr))
        if not len(price):
            continue
        price = price[0]
        price_type = re.findall('room_type_name">(.*?)</span>', str(tr))[0]
        if 'em' in price_type:
            price_type = ','.join([
                *re.findall('(.*?)<em', price_type),
                *re.findall('（(.*?)）', price_type)  # full-width parentheses in the page text
            ])
        hotel_detail.append([room_name, price_type, price])

    output_dir = '{}hotelDetail.txt'.format(data_dir)
    with open(output_dir, 'w') as f:
        f.write('\n'.join([','.join(ii) for ii in hotel_detail]))
    echo(1, 'Load {} price\nOutput path: {}\nSpend time: {:.2f}s'.format(
        len(hotel_detail), output_dir, end_time(version, 0)))
    return hotel_detail
def get_other_proxies(self, url):
    ''' get other proxies '''
    text = self.request_text(url)
    pages = re.findall(r'<h3[\s\S]*?<a.*?(http.*?\.html).*?</a>',
                       '' if text is None else text)
    if not len(pages):
        echo(0, 'Please do not frequently request {}!!!'.format(url))
    else:
        proxies = [re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}',
                              self.request_text(ii)) for ii in pages]
        self.waitjudge = [*self.waitjudge, *sum(proxies, [])]
def insertproxy(self, insertlist):
    """ insert data to db """
    results = self.Db.insert_db(self.insert_sql % str(insertlist)[1:-1])
    if results:
        echo(2, 'Insert ' + str(len(insertlist)) + ' items Success!')
def load_proxies_test(self):
    ''' load mode & test proxies '''
    start = time.time()
    self.load_proxies_list()
    proxies_len = len(self.waitjudge)
    self.threadjude()
    canuse_len = len(self.canuse_proxies)
    echo(1, '\nTotal Proxies num: {}\nCan use num: {}\nTime spend: {:.2f}s\n'.format(
        proxies_len, canuse_len, time.time() - start))
    with open('{}canuse_proxies.txt'.format(data_dir), 'w') as f:
        f.write('\n'.join(self.canuse_proxies))
def get_request_proxy(self, url: str, types: int, data=None, test_func=None, header=None):
    """ send a request through a proxy, and record proxies that can't be used

        @types S0XY: X = 0 -> get,  1 -> post
                     Y = 0 -> html, 1 -> json, 2 -> basic
                     S = 0 -> basic, 1 -> ss
        supports retry on failure && auto-records failed proxies
    """
    httptype = url[4] == 's'
    ss_type = types // 1000
    types %= 1000
    if ss_type:
        proxylist = self.proxylists_ss if httptype else self.proxylist_ss
    else:
        proxylist = self.proxylists if httptype else self.proxylist

    if not len(proxylist):
        if self.Db.db:
            echo(0, 'Proxy pool empty!!! Please check the db conn & db dataset!!!')
        proxies = {}
        proxies_url = None
    else:
        index = random.randint(0, len(proxylist) - 1)
        proxies_url = proxylist[index]
        proxies = {type_map[httptype]: proxies_url}
    try:
        result = basic_req(url, types, proxies, data, header)
        if test_func is not None and not test_func(result):
            if self.check_retry(url):
                return self.get_request_proxy(url, types + 1000 * ss_type, data, test_func)
            self.failuredtime[url] = 0
            return
        return result
    except:
        self.cannotuseip[random.randint(0, MAXN)] = proxies_url
        if proxies_url in proxylist:
            proxylist.remove(proxies_url)  # remove by value, not by index
        if not len(self.cannotuseip.keys()) % 10:
            self.cleancannotuse()
        if self.check_retry(url):
            return self.get_request_proxy(url, types + 1000 * ss_type, data, test_func)
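# A minimal usage sketch (not part of the original code) of how the packed
# `types` argument of get_request_proxy above decodes, assuming basic_req uses
# the same X/Y flags. `pool` stands for an instance of the surrounding class,
# and the URL/payload are placeholders:
#   types = 0    -> GET,  html
#   types = 11   -> POST, json (the value load_spot_once passes above)
#   types = 1011 -> POST, json through the ss proxy pool (S = 1)
def _demo_get_request_proxy(pool):
    return pool.get_request_proxy(
        'https://example.com/api', 11, data={'iPage': 1},
        test_func=lambda req: req is not None and 'data' in req)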
def updateproxy(self, updatelist, types):
    """ update data to db """
    results = self.Db.update_db(self.replace_ip % str(updatelist)[1:-1])
    typemap = {0: 'can use ', 1: 'can not use '}
    if results:
        echo(2, 'Update', typemap[types], str(len(updatelist)), ' items Success!')
def request_text(self, url):
    ''' requests text '''
    req = basic_req(url, 2)
    if req is None:
        echo(0, url)
        if can_retry(url):
            return self.request_text(url)  # propagate the retried result
        return ''
    echo(1, url)
    return req.text
def get_hotel_detail(self):
    ''' get hotel detail '''
    params = {
        **self.generate_other_params(),
        'callback': self.generate_callback(16),
        'eleven': self.generate_eleven(),
        '_': int(time.time() * 1000)
    }
    params_list = [
        '{}={}'.format(ii, jj if jj is not None else '')
        for ii, jj in params.items()
    ]
    url = '{}?{}'.format(HOTEL_ROOMLIST_FOR_DETAIL_URL, '&'.join(params_list))
    echo(2, 'XHR url', url)
    text = basic_req(url, 1)
    return text
def sixsixip(self, area, page):
    """ 66ip proxy
        http://www.66ip.cn/areaindex_{area}/{page}.html
    """
    version = begin_time()
    threadings = []
    for index in range(1, area + 1):
        for pageindex in range(1, page + 1):
            echo(2, str(index) + ' ' + str(pageindex))
            work = threading.Thread(
                target=self.sixsixthread, args=(index, pageindex))
            threadings.append(work)
    for work in threadings:
        work.start()
    for work in threadings:
        work.join()
    self.threadjude()
    end_time(version)
def load_proxies_list(self, types=2):
    ''' load proxies '''
    SITES = ['http://www.proxyserverlist24.top/',
             'http://www.live-socks.net/']
    self.waitjudge = []
    for site in SITES:
        self.get_other_proxies(site)
    if os.path.exists('{}gatherproxy'.format(data_dir)):
        self.gatherproxy(3)
    waitjudge = list(set(self.waitjudge))
    waitjudge_http = ['http://' + ii for ii in waitjudge]
    waitjudge_https = ['https://' + ii for ii in waitjudge]
    if not types:
        self.waitjudge = waitjudge_http
    elif types == 1:
        self.waitjudge = waitjudge_https
    else:
        self.waitjudge = waitjudge_http + waitjudge_https
    echo(1, '-_-_-_-_-_-_-', len(waitjudge), 'Proxies wait to judge -_-_-_-_-_-_-')
def judgeurl(self, urls, index, times, ss_test=False):
    """ judge whether a proxy works:
        use /api/playlist to test http; use /discover/playlist to test https
        1. the request must not time out (timeout = 5)
        2. len(response.result.tracks) must match the expected value (56)
    """
    http_type = urls[4] == 's'
    proxies = {type_map[http_type]: urls}
    test_url = type_map[http_type] + '://music.163.com/api/playlist/detail?id=432853362'
    ss_url = 'https://www.google.com/?gws_rd=ssl'
    try:
        data = basic_req(test_url, 1, proxies)
        result = data['result']
        tracks = result['tracks']
        if len(tracks) == 56:
            if times < 0:
                self.judgeurl(urls, index, times + 1)  # re-test until times reaches 0
            else:
                echo(1, urls, proxies, 'Proxies can use.')
                self.canuse_proxies.append(urls)
                self.canuseip[index] = [urls, int(http_type)]
                if ss_test:
                    data = basic_req(ss_url, 0)
                    if len(str(data)) > 5000:
                        self.canuseip[index] = [urls, int(http_type) + 2]
        else:
            echo(0, urls, proxies, 'Tracks len error ^--<^>--^ ')
            self.cannotuseip[index] = urls
    except:
        echo(0, urls, proxies, 'return error [][][][][][]')
        if index not in self.canuseip:
            self.cannotuseip[index] = urls
def get_cookie(self):
    """ make cookie login
        PS: Although the cookie's expiry is more than a year away, it breaks
        once the connection closes, so you need to reactivate it with this function.
    """
    headers = {
        'pragma': 'no-cache',
        'cache-control': 'no-cache',
        'Host': 'www.gatherproxy.com',
        'Origin': 'http://www.gatherproxy.com',
        'Referer': 'http://www.gatherproxy.com/proxylist/anonymity/?t=Transparent',
        'Cookie': '_lang=en-US; _ga=GA1.2.1084455496.1548351129; _gid=GA1.2.1515017701.1552361687; ASP.NET_SessionId=ckin3pzyqyoyt3zg54zrtrct; _gat=1; arp_scroll_position=57',
        'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Accept-Encoding': '',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3682.0 Safari/537.36',
    }
    login_url = 'http://www.gatherproxy.com/subscribe/login'

    cookie_html = basic_req(login_url, 0, header=headers)
    verify_text = cookie_html.find_all('div', class_='label')[2].span.text
    verify_list = verify_text.replace('= ', '').strip().split()
    num_map = {'Zero': 0, 'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5,
               'Six': 6, 'Seven': 7, 'Eight': 8, 'Nine': 9, 'Ten': 10}
    verify_num = [verify_list[0], verify_list[2]]
    for index, num in enumerate(verify_num):
        if num.isdigit():
            verify_num[index] = int(num)
        elif num in num_map:
            verify_num[index] = num_map[num]
        else:
            echo(0, 'Error', index)
            # return False

    verify_code = 0
    error = True
    operation = verify_list[1]
    if operation in ('+', 'plus', 'add'):
        verify_code = verify_num[0] + verify_num[1]
        error = False
    elif operation in ('-', 'minus'):
        verify_code = verify_num[0] - verify_num[1]
        error = False
    elif operation in ('X', 'multiplication', 'multiplied'):
        verify_code = verify_num[0] * verify_num[1]
        error = False
    if error:
        echo(0, 'Error', operation)

    if not os.path.exists('%spassage' % data_dir):
        echo(0, 'gather passage not exist!!!')
        return
    with codecs.open('%spassage' % data_dir, 'r', encoding='utf-8') as f:
        passage = [index[:-1] for index in f.readlines()]
    data = {'Username': passage[0], 'Password': passage[1], 'Captcha': str(verify_code)}

    time.sleep(2.163)
    r = requests.session()
    r.cookies = cj.LWPCookieJar()
    login_req = r.post(login_url, headers=headers, data=data, verify=False)
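# A minimal standalone sketch (hypothetical input, not in the original source)
# of the word-captcha arithmetic that get_cookie performs above. gatherproxy
# renders its captcha as words, e.g. 'Two + Three = '.
def _solve_captcha(verify_text):
    num_map = {'Zero': 0, 'One': 1, 'Two': 2, 'Three': 3, 'Four': 4, 'Five': 5,
               'Six': 6, 'Seven': 7, 'Eight': 8, 'Nine': 9, 'Ten': 10}
    left, op, right = verify_text.replace('= ', '').strip().split()[:3]
    a, b = (int(t) if t.isdigit() else num_map[t] for t in (left, right))
    if op in ('+', 'plus', 'add'):
        return a + b
    if op in ('-', 'minus'):
        return a - b
    return a * b  # 'X', 'multiplication', 'multiplied'

assert _solve_captcha('Two + Three = ') == 5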
def xiciproxy(self, page):
    """ xici proxy
        http://www.xicidaili.com/nn/{page}
        The first proxy source I used, but most of its proxies no longer work.
    """
    if not str(page).isdigit():
        echo(0, 'Please input a number!')
        return []
    version = begin_time()
    url = 'http://www.xicidaili.com/nn/%d'
    for index in range(1, page + 1):
        html = basic_req(url % index, 0)
        tem = html.find_all('tr')
        for jj in range(1, len(tem)):  # don't shadow the outer loop variable
            tds = tem[jj].find_all('td')
            protocol = tds[5].text.lower()  # scheme column (http/https)
            self.waitjudge.append(
                protocol + '://' + tds[1].text + ':' + tds[2].text)
    self.threadjude()
    end_time(version)
def gatherproxy(self, types):
    """ :100: very nice website
        First of all, download the proxy ip txt from:
        http://www.gatherproxy.com/zh/proxylist/country/?c=China
    """
    if not os.path.exists('{}gatherproxy'.format(data_dir)):
        echo(0, 'Gather file not exist!!!')
        return
    with codecs.open('{}gatherproxy'.format(data_dir), 'r', encoding='utf-8') as f:
        file_d = [ii.strip() for ii in f.readlines()]
    waitjudge_http = ['http://' + ii for ii in file_d]
    waitjudge_https = ['https://' + ii for ii in file_d]
    if not types:
        self.waitjudge += waitjudge_http
    elif types == 1:
        self.waitjudge += waitjudge_https
    elif types == 2:
        self.waitjudge += waitjudge_http + waitjudge_https
    else:
        self.waitjudge += file_d
    echo(2, 'load gather over!')
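# Hedged usage sketch (not in the original source): the four `types` modes of
# gatherproxy() and what each appends to self.waitjudge. `pool` stands for an
# instance of the surrounding class; each call below appends to the queue.
def _demo_gatherproxy(pool):
    pool.gatherproxy(0)  # http://ip:port candidates
    pool.gatherproxy(1)  # https://ip:port candidates
    pool.gatherproxy(2)  # both schemes
    pool.gatherproxy(3)  # raw ip:port lines (what load_proxies_list uses)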
def load_spot(self, batch_size=50):
    ''' load spot '''
    version = begin_time()
    self.load_city_list()
    # self.city_list = [10186]
    city_threading = [
        threading.Thread(target=self.load_spot_once, args=(1, ii))
        for ii in self.city_list
    ]
    shuffle_batch_run_thread(city_threading, 150)

    spot_continue = []
    for ii, jj in self.spot_pn.items():
        spot_continue += [
            threading.Thread(target=self.load_spot_once, args=(pn, ii))
            for pn in range(2, jj + 1)
        ]
    shuffle_batch_run_thread(spot_continue, 150)

    output = [
        '{},{}'.format(self.id2map[ii], ','.join(jj))
        for ii, jj in self.spot_result.items()
    ]
    output_path = '{}spot.txt'.format(data_dir)
    with open(output_path, 'w') as f:
        f.write('\n'.join(output))
    city_num = len(self.city_list)
    spot_num = sum([len(ii) for ii in self.spot_result.values()])
    echo(1, 'City num: {}\nSpot num: {}\nOutput path: {}\nSpend time: {:.2f}s\n'.format(
        city_num, spot_num, output_path, end_time(version, 0)))
def eval_all(sizes, output, testing):
    circuit_type = 'TAC' if testing else 'AC'
    fname = paths.exp / u.time_stamp(f'eval_rect_{output}_{testing}', 'txt')
    f = open(fname, 'w+')
    u.echo(f, f'\n===Rectangle: evaluating {circuit_type} for {output}===')
    u.echo(f, 'output logged into logs/exp/')
    start_time = time.time()
    for size in sizes:
        eval(f, size, output, testing)
    all_time = time.time() - start_time
    u.echo(f, f'\nTotal Time: {all_time:.3f} sec')
    f.close()
def test_tw_reduction(ssize):
    saved = u.verbose
    u.verbose = False
    # vcount: number of network vars
    # scount: max number of values per var
    # pcount: max number of parents per var
    counts = (75, 2, 4), (100, 3, 5)  # vcount, scount, pcount
    fperct = (1/4, 1/2, 2/3, 4/5)     # percentage of functional vars
    fname = f'TW{ssize}_C{counts}_P{fperct}'
    fname = paths.exp / u.time_stamp(fname, 'txt')
    f = open(fname, 'w+')
    u.echo(f, '\n===Reduction in TreeWidth')
    u.echo(f, f'sample size {ssize}')
    u.echo(f, 'output logged into logs/exp/')
    start_time = time.perf_counter()
    for vcount, scount, pcount in counts:
        back = vcount - 1
        for functional_fraction in fperct:
            fcount = int(vcount * functional_fraction)
            w1_sample = []
            w2_sample = []
            for _ in range(ssize):
                bn, _, _ = rbn.get(vcount, scount, pcount, fcount, back, testing=False)
                bn1 = bn.copy_for_inference()
                bn2, _, (w1, w2) = decouple.get(bn1, [], False, 'minfill', None)
                w1_sample.append(w1)
                w2_sample.append(w2)
            reduction = [w1 - w2 for w1, w2 in zip(w1_sample, w2_sample)]
            rd_mean, rd_stdev = s.mean(reduction), s.stdev(reduction)
            w1_mean, w1_stdev = s.mean(w1_sample), s.stdev(w1_sample)
            w2_mean, w2_stdev = s.mean(w2_sample), s.stdev(w2_sample)
            u.echo(f, f'\n== vcount {vcount}, scount {scount}, pcount {pcount}, '
                      f'fcount {functional_fraction:.2f}, ')
            u.echo(f, f'before mean {w1_mean:.1f} stdev {w1_stdev:.1f}')
            u.echo(f, f'after mean {w2_mean:.1f} stdev {w2_stdev:.1f}')
            u.echo(f, f'reduce mean {rd_mean:.1f} stdev {rd_stdev:.1f}')
    all_time = time.perf_counter() - start_time
    u.echo(f, f'\n===Total Time: {all_time:.3f} sec')
    f.close()
    u.verbose = saved  # was `v.verbose`, a typo: restore the saved u.verbose flag
def generate_eleven(self):
    ################################################################
    #
    # [generate eleven] version 19.4.21 (tested ✔️), written by gunjianpan
    #
    # 1. randomly generate a 15-char `callback` param;
    # 2. use callback to request OCEANBALL -> get the origin js;
    # 3. eval once -> (match the array, then chr() it) -> decoded js;
    # 4. replace document and window (you can also use execjs & jsdom);
    # 5. warning: you should replace `this` with some params, otherwise
    #    you will get `老板给小三买了包, 却没有给你钱买房` (a taunt string
    #    instead of the real value);
    # 6. finish, return, and join the params;
    #
    ################################################################
    callback = self.generate_callback(15)
    now_time = int(time.time() * 1000)
    url = '{}?callback={}&_={}'.format(OCEANBALL_URL, callback, now_time)
    referer_url = HOTEL_DETAIL_URL % self.default_hotel_id
    changeHeaders({'Referer': referer_url})
    oceanball_js = basic_req(url, 3)

    array = re.findall(r'\(\[(.*)\],', oceanball_js)[0].split(',')
    array = [int(ii) for ii in array]
    offset = int(re.findall(r'item-(\d*?)\)', oceanball_js)[0])
    # String.fromCharCode
    oe = ''.join([chr(ii - offset) for ii in array])

    # replace the window[callback] callback function
    replace_str = re.findall(r'{}\(new.*\)\);'.format(callback), oe)[0]
    eleven_params = re.findall(
        r'{}\(new.*\+ (.*?) \+.*\)\);'.format(callback), oe)[0]
    replaced_str = 'return {};'.format(eleven_params)
    oe = oe.replace(replace_str, replaced_str)
    oe = oe.replace('\'', '"').replace('\r', '')
    oe = oe.replace(';!', 'let aaa = ', 1)
    replace = '''function(){let href='https://hotels.ctrip.com/hotel/4889292.html';
        a={'documentElement': {'attributes':{}}};
        b={};
        function c(){};
        userAgent ='Chrome/73.0.3682.0';
        geolocation = 0; '''

    # replace document & window & navigator
    oe = oe.replace('document.body.innerHTML.length', '888').replace('document.body.innerHTML', '""')
    oe = oe.replace('document.createElement("div")', '{}')
    oe = oe.replace('window.HTMLSpanElement', 'c').replace('document.createElement("span")', '1')
    oe = oe.replace('window.location.href', 'href').replace('location.href', 'href')
    oe = oe.replace('navigator.', '')
    oe = oe.replace('new Image().', '')
    oe = oe.replace('document.all', '0').replace('document.referrer', '""')
    oe = oe.replace('this || ', '')
    oe = oe.replace('window["document"]', 'a')
    oe = oe.replace('document', 'a').replace('window', 'b')
    oe = oe.replace('function(){', replace, 1)

    # eval script
    eleven = js2py.eval_js(oe)
    echo(1, 'eleven', eleven)
    return eleven
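# A toy walk-through (made-up values, not from a real oceanball response) of
# step 3 above: the obfuscated js arrives as an int array that decodes via a
# constant offset, exactly like String.fromCharCode(i - offset) in the browser.
array, offset = [104, 121, 100, 111], 3
decoded = ''.join(chr(ii - offset) for ii in array)
assert decoded == 'eval'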
def __posterior_time(f, bn, inputs, output, bsize, min_ac, max_ac, counter):
    s_time = time.perf_counter()
    AC = tac.TAC(bn, inputs, output)
    t = time.perf_counter() - s_time
    if AC.size < min_ac * 1000000 or AC.size > max_ac * 1000000:
        return None
    u.echo(f, f'\n== {counter} ==\nTensor AC:', end='')
    u.echo(f, f' {t:.1f} sec')
    u.echo(f, f' size {AC.size:,}, max binary rank {AC.binary_rank:0.1f}')

    # get evidence
    cards = tuple(bn.node(input).card for input in inputs)
    evidence = data.evd_random(bsize, cards)

    # evaluate AC as tf graph with batch
    u.echo(f, f'(tf full) eval:', end='', flush=True)
    tac_posteriors, t_AC, b_AC = AC.evaluate(evidence, report_time=True)
    u.echo(f, f' {t_AC:.2f} sec'
              f'\n {1000*t_AC/bsize:.0f} ms per example, used batch size {b_AC}'
              f'\n {1000*t_AC/bsize/(AC.size/1000000):.0f} ms per 1M nodes (one example)')

    # check classical AC and numpy
    AC_size = AC.size
    AC_brank = AC.binary_rank
    ops_graph = AC.ops_graph
    del AC  # no longer needed

    u.echo(f, '\nScalar AC:', end='')
    s_time = time.perf_counter()
    SAC = verify.AC.ScalarAC(ops_graph)
    t = time.perf_counter() - s_time
    u.echo(f, f' {t:.1f} sec')
    u.echo(f, f' size {SAC.size:,}')
    u.echo(f, f' {SAC.size/AC_size:.2f} scalar ac/tensor ac')

    def v(eval_func, type):
        u.echo(f, f'({type}) eval:', end='', flush=True)
        t_SAC, b_SAC = eval_func(evidence, tac_posteriors)
        u.echo(f, f' {t_SAC:.2f} sec'
                  f'\n {1000*t_SAC/bsize:.0f} ms per example, used batch size {b_SAC}'
                  f'\n {t_SAC/t_AC:.2f} {type}/ac ')
        return t_SAC, b_SAC

    t_numpy, b_numpy = v(SAC.verify_numpy, 'numpy batch')
    # t_tf, b_tf = v(SAC.verify_tf, 'tf batch')
    t_tf, b_tf = 0, 0
    # t_array, b_array = v(SAC.verify_array, 'array')
    t_array, b_array = 0, 0

    return (AC_size, AC_brank, SAC.size, t_AC, t_numpy, t_tf, t_array,
            b_AC, b_numpy, b_tf, b_array)
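# Worked example (made-up numbers) of the 'ms per 1M nodes' figure printed
# above: with t_AC = 2.0 sec over bsize = 100 examples on an AC of 5,000,000
# nodes, 1000*2.0/100 = 20 ms per example, and 20/(5e6/1e6) = 4 ms per 1M nodes.
t_AC, bsize, size = 2.0, 100, 5_000_000
assert 1000 * t_AC / bsize / (size / 1_000_000) == 4.0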
def test_eval_time(ssize, bsize, min_ac, max_ac, vc, sc, pc):
    saved = u.verbose
    u.verbose = False
    fcount = vc // 2  # number of vars with functional cpt
    back = vc - 1
    fname = (f'RBN_S{ssize}_B{bsize}_'
             f'C{min_ac}_{max_ac}_BN_'
             f'{vc}_{sc}_{pc}_{fcount}_{back}')
    fname = paths.exp / u.time_stamp(fname, 'txt')
    f = open(fname, 'w+')
    u.echo(f, f'\n===Evaluation time for random bayesian networks===\n')
    u.echo(f, f'{vc} vars, {sc} values, {pc} parents, '
              f'{fcount} functional vars (no roots), {back} back'
              f'\n{ssize} circuits, '
              f'size {min_ac}-{max_ac}M'
              f'\n{bsize} examples')
    u.echo(f, 'output logged into logs/exp/')
    start_time = time.perf_counter()

    # stats
    s_AC, r_AC, s_SAC = [], [], []
    t_AC, t_numpy, t_tf, t_array = [], [], [], []
    b_AC, b_numpy, b_tf, b_array = [], [], [], []

    def process(result):
        s, r, s2, tac, tnumpy, ttf, tarray, bac, bnumpy, btf, barray = result
        s_AC.append(s)
        r_AC.append(r)
        s_SAC.append(s2)
        t_AC.append(tac)
        t_numpy.append(tnumpy)
        t_tf.append(ttf)
        t_array.append(tarray)
        b_AC.append(bac)
        b_numpy.append(bnumpy)
        b_tf.append(btf)
        b_array.append(barray)

    success = 0
    while success < ssize:
        bn, inputs, outputs = rbn.get(vc, sc, pc, fcount, back, testing=False)
        i = np.random.choice(inputs)
        o = np.random.choice(outputs)
        result = __posterior_time(f, bn, inputs, o, bsize, min_ac, max_ac, success)  # causal
        if result:
            success += 1
            process(result)
        if success < ssize:  # guard so success cannot overshoot ssize
            result = __posterior_time(f, bn, outputs, i, bsize, min_ac, max_ac, success)  # evidential
            if result:
                success += 1
                process(result)
    assert len(s_AC) == ssize

    # summary stats
    # eval time for ac per one million nodes and one example
    ac_per_mill = [1000*t/bsize//(sz/1000000) for t, sz in zip(t_AC, s_AC)]
    # size of largest tensor in ac (2 ** max binary rank)
    ac_max_rank = r_AC
    # comparing tensor and scalar ac size
    sac_ac = [s1/s2 for s1, s2 in zip(s_SAC, s_AC)]
    # comparing ac eval time with others
    numpy_ac = [t1/t2 for t1, t2 in zip(t_numpy, t_AC)]
    tf_ac = [t1/t2 for t1, t2 in zip(t_tf, t_AC)]
    array_ac = [t1/t2 for t1, t2 in zip(t_array, t_AC)]

    u.echo(f, f'\n==\nsummary stats ({ssize} circuits, {bsize} examples, '
              f'size {min_ac}-{max_ac}M)')
    u.echo(f, f' ac size: mean {int(s.mean(s_AC)):,}, stdev {int(s.stdev(s_AC)):,}, '
              f'min {min(s_AC):,}, max {max(s_AC):,}')
    u.echo(f, f' ac brank: mean {s.mean(ac_max_rank):.1f}, stdev {s.stdev(ac_max_rank):.1f}')
    u.echo(f, f' sac/ac size: mean {s.mean(sac_ac):.2f}, stdev {s.stdev(sac_ac):.2f}')
    # used batch size may differ from evidence size due to memory limitations
    u.echo(f, f'\nused batch size')
    u.echo(f, f' ac : mean {s.mean(b_AC):.1f}, stddev {s.stdev(b_AC):.1f}')
    u.echo(f, f' numpy: mean {s.mean(b_numpy):.1f}, stddev {s.stdev(b_numpy):.1f}')
    u.echo(f, f' tf : mean {s.mean(b_tf):.1f}, stddev {s.stdev(b_tf):.1f}')
    u.echo(f, f' array: mean {s.mean(b_array):.1f}, stddev {s.stdev(b_array):.1f}')
    u.echo(f, f'\neval time')
    u.echo(f, f' ac / 1M : mean {s.mean(ac_per_mill):,}, stdev {s.stdev(ac_per_mill):.1f}')
    u.echo(f, f' numpy/ac: mean {s.mean(numpy_ac):.1f}, stdev {s.stdev(numpy_ac):.1f}')
    u.echo(f, f' tf/ac : mean {s.mean(tf_ac):.1f}, stdev {s.stdev(tf_ac):.1f}')
    u.echo(f, f' array/ac: mean {s.mean(array_ac):.1f}, stdev {s.stdev(array_ac):.1f}')

    all_time = time.perf_counter() - start_time
    u.echo(f, f'\n===Total Time: {all_time:.3f} sec (includes skipped circuits)')
    f.close()
    u.verbose = saved
def eval(f, size, output, testing):
    circuit_type = 'TAC' if testing else 'AC'

    # get data (ground truth)
    evidence, marginals = rdata.get(size, output)
    ecount = len(marginals)  # number of examples
    u.echo(f, f'\n==rectangle {size}x{size} images: {ecount} total')

    # get model
    bn, inputs = rmodel.get(size, output, testing=testing, use_bk=True, tie_parameters=False)

    # compile model
    s_time = time.time()
    u.echo(f, f'\ncompiling {circuit_type}:', end='')
    AC = tac.TAC(bn, inputs, output, trainable=False, profile=False)
    t = time.time() - s_time
    u.echo(f, f' {t:.1f} sec')
    u.echo(f, f' {circuit_type} size {AC.size:,}\n (sep) binary rank {AC.binary_rank:.1f}, rank {AC.rank}')

    # evaluate AC on evidence to get predictions
    u.echo(f, f'evaluating {circuit_type}:\n', end='', flush=True)
    predictions, t1, batch_size = AC.evaluate(evidence, report_time=True)
    u.echo(f, f' batch size {batch_size}')
    u.echo(f, f' {t1:.2f} sec, {1000*t1/ecount:.1f} ms per example')
def train_all(size, output, tries, data_sizes, testing, use_bk, tie_parameters, batch_size):
    start_time = time.time()
    fname = paths.exp / u.time_stamp(
        f'train_rect_{size}_{output}_{tries}_{testing}_{use_bk}_{tie_parameters}', 'txt')
    f = open(fname, 'w+')
    u.echo(f, f'\nrectangle {size} x {size}, output {output}, data_sizes {data_sizes}, '
              f'testing {testing}, use_bk {use_bk}, tie {tie_parameters}\n')
    u.echo(f, f'fixed batch size {batch_size}')
    u.echo(f, 'output logged into logs/exp/')

    def get_data(data_size):
        # full data
        t_evidence, t_labels = rdata.get(size, output, noisy_image_count=size, noise_count=size)
        v_evidence, v_labels = rdata.get(size, output, noisy_image_count=2*size, noise_count=2*size)
        # random subset
        t_percentage = data_size / len(t_labels)
        v_percentage = max(1000, data_size) / len(v_labels)  # no less than 1000
        t_evidence, t_labels = data.random_subset(t_evidence, t_labels, t_percentage)
        v_evidence, v_labels = data.random_subset(v_evidence, v_labels, v_percentage)
        return t_evidence, t_labels, v_evidence, v_labels

    # get model
    net, inputs = rmodel.get(size, output, testing, use_bk, tie_parameters)
    # compile model into circuit
    circuit = tac.TAC(net, inputs, output, trainable=True, profile=False)
    u.echo(f, f'circuit size {circuit.size:,}, parameter count {circuit.parameter_count}\n')

    for data_size, count in zip(data_sizes, tries):
        u.echo(f, f'==data size {data_size}')
        t_evidence, t_labels, v_evidence, v_labels = get_data(data_size)
        u.echo(f, f' train {len(t_labels)}, test {len(v_labels)}')
        u.echo(f, f' accuracy ({count}):', end='', flush=True)
        sample = []
        for i in range(count):
            circuit.fit(t_evidence, t_labels, loss_type='CE', metric_type='CA', batch_size=batch_size)
            acc = 100 * circuit.metric(v_evidence, v_labels, metric_type='CA')
            sample.append(acc)
            u.echo(f, f' {acc:.2f}', end='', flush=True)
        u.echo(f, f'\naccuracy mean {s.mean(sample):.2f}, std {s.stdev(sample):.2f}\n')
    all_time = time.time() - start_time
    u.echo(f, f'Total Time: {all_time:.3f} sec')
    f.close()