def get_data(self, url):
    """Download ``url``, extract the proxy rows it lists and store them.

    Every element returned by ``soup.find_all(self.find_tag)`` is read as
    one record. Fixed child positions of that element supply the ip (3),
    port (5), city (7), connection type (11) and speed (13); the country
    is always recorded as ``'cn'``. The records are handed to
    ``Database.insert_update`` with ``['ip', 'port']`` as the last
    argument (presumably the identifying columns - confirm against
    ``Database``).

    :param url: page to request with ``self.headers``.
    """

    def string_or_blank(node):
        # ``.string`` is None when the node has no single text child.
        return node.string if node.string else ''

    db = Database()
    response = requests.get(url, headers=self.headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    items = soup.find_all(self.find_tag)
    LOG.debug('found %s' % len(items))

    recs = []
    for item in items:
        cells = item.contents
        ip = string_or_blank(cells[3])
        port = string_or_blank(cells[5])
        # Only the presence of the <a> is checked, so city may be None
        # when the link exists but has no single text child.
        city = cells[7].a.string if cells[7].a else ''
        conn_type = string_or_blank(cells[11])
        # Drop the last character of the title attribute
        # (presumably a unit suffix - confirm against the page markup).
        speed = cells[13].div['title'][:-1]
        recs.append({
            'ip': ip,
            'port': port,
            'country': 'cn',
            'city': city,
            'speed': speed,
            'conn_type': conn_type,
        })

    inserted, updated = db.insert_update(
        recs, 'ippool_exist', 'ippool_insert', 'ippool_update',
        ['ip', 'port'])
def get_center_pane(self):
    """Return the list control for the centre pane, creating it on first use.

    The first call opens a ``Database`` (kept on ``self.db``) and builds a
    ``FactivaListCtrl`` on ``self.parent``; later calls return the cached
    control unchanged.
    """
    if self.list is not None:
        return self.list

    self.db = Database()
    self.list = FactivaListCtrl(
        self.parent, self.db, style=wx.LC_REPORT | wx.BORDER_NONE)
    return self.list
def load_data(self):
    """Refresh the three LED displays from the ``ippool_count`` query.

    The first row of the query supplies ``c``, ``s`` and ``f``; a value
    of ``None`` is shown as 0. They are written to ``led_total``,
    ``led_succ`` and ``led_fail`` respectively.
    """
    stats = Database().query('ippool_count')[0]

    total = 0 if stats.c is None else stats.c
    succeeded = 0 if stats.s is None else stats.s
    failed = 0 if stats.f is None else stats.f

    self.led_total.set_value(total)
    self.led_succ.set_value(succeeded)
    self.led_fail.set_value(failed)
def records(self):
    """Return every ``ippool_all`` row as a display list.

    Each entry is ``[country, ip, port, city, score, conn_type]`` where
    ``score`` is ``100 - int(100 * speed / spread)`` and ``spread`` is the
    difference between the last and the first row's speed.

    NOTE(review): using first/last as the range presumes the query
    returns rows ordered by speed - confirm against the SQL.

    :return: list of row lists; empty when the query yields no rows.
    """
    db = Database()
    rows = db.query('ippool_all').as_dict()
    if not rows:
        return []

    spread = rows[-1]['speed'] - rows[0]['speed']

    def score(speed):
        # With a single row (or identical first/last speeds) the spread
        # is zero and the original division raised ZeroDivisionError.
        # There is nothing to scale against, so report the top score.
        if not spread:
            return 100
        return 100 - int(100 * (speed / spread))

    return [[
        d['country'],
        d['ip'],
        d['port'],
        d['city'],
        score(d['speed']),
        d['conn_type'],
    ] for d in rows]
def OnInit(self):
    """Load the configuration, initialise the database, show the main frame.

    The dictionary passed to ``Config`` next to the path is presumably the
    set of defaults used when ``cfg.json`` is missing - confirm against
    ``Config``.

    :return: always ``True``.
    """
    timer = Timeit()
    LOG.debug("OnInit")
    self.cfg_path = 'cfg.json'

    # Configuration handed to Config together with the file path.
    database_section = {
        'dbtype': 'sqlite',
        'dbname': 'data.db',
        'dbuser': '',
        'dbpass': '',
        'dbaddr': 'localhost',
    }
    ippool_section = {
        'urls': ['http://www.xicidaili.com/nn/'],
        'interval': 60,
    }
    settings = {
        'version': '1.0.0',
        'database': database_section,
        'ippool': ippool_section,
    }
    cfg = Config(self.cfg_path, settings).data

    # Initialise the database from the 'database' section.
    Database(cfg['database']).init_db()

    frame = RibbonFrame(
        None, -1, '爬虫工具集 v0.1 [email protected]', size=(1024, 768))
    frame.Show()

    LOG.debug('Elapsed time: %f ms' % timer.end())
    return True
def update(self, o):
    """Persist one record's ``valid`` flag and refresh its row.

    :param o: mapping with ``'idx'`` (index into ``self.data``) and
        ``'rec'`` (the record; its ``'id'`` and ``'valid'`` entries are
        passed to the ``ippool_update_valid`` query).
    """
    idx = o['idx']
    rec = o['rec']

    db = Database()
    try:
        db.query('ippool_update_valid', id=rec['id'], valid=rec['valid'])
    finally:
        # Close even when the query raises, so the handle is not leaked.
        db.close()

    self.data[idx] = rec
    self.RowChanged(idx)
def OnInit(self):
    """Set up the locale, configuration and database, then show the frame.

    The dictionary passed to ``Config`` next to the path is presumably the
    set of defaults used when ``cfg.json`` is missing - confirm against
    ``Config``.

    :return: always ``True``.
    """
    t = Timeit()
    LOG.info("OnInit")

    # Load translation catalogs from the 'locale' directory under the
    # current directory.
    # TODO: on macOS the catalogs should live inside the app bundle.
    #
    # The wx.Locale object must outlive this method: when the Python
    # wrapper is garbage collected the locale is torn down and the
    # catalogs stop being used. Keep it on the application instance
    # instead of in a local variable.
    self.locale = wx.Locale(wx.LANGUAGE_CHINESE_SIMPLIFIED)
    if self.locale.IsOk():
        self.locale.AddCatalogLookupPathPrefix('locale')
        self.locale.AddCatalog('app')

    self.cfg_path = 'cfg.json'
    # Configuration handed to Config together with the file path.
    self.cfg = Config(self.cfg_path, {
        'version': '1.0.0',
        'database': {
            'dbtype': 'sqlite',
            'dbname': 'data.db',
            'dbuser': '',
            'dbpass': '',
            'dbaddr': 'localhost',
        },
        'ippool': {
            'urls': ['http://www.xicidaili.com/nn/'],
            'interval': 60,
        },
    }).data

    # Initialise the database from the 'database' section.
    Database(self.cfg['database']).init_db()

    # Imported here rather than at module level, as in the original
    # (the AUI frame replaced the earlier ribbon-based frame).
    from ui.auiframe import AuiFrame
    frm = AuiFrame(None, -1, _('Personal Python Tools'), size=(1024, 768))
    frm.Show()

    LOG.info('Elapsed time: %f ms' % t.end())
    return True
self.thread_stop = False LOG.debug('Start proxy spider') step = 1 urls = [] for idx, url in enumerate(self.urls): if self.thread_stop: break urls.append(url) if idx % step == 0: LOG.debug(urls) pool = ThreadPool(self.pool_size) results = pool.map(self.proc_func, urls) # close the pool and wait for the work to finish pool.close() pool.join() urls[:] = [] SIG_REFRESH.send(self) time.sleep(self.interval) LOG.debug('Finish proxy spider') def stop(self): self.thread_stop = True if __name__ == '__main__': cfg_path = 'cfg.json' cfg = Config(cfg_path).data Database(cfg['database']).init_db() spider = ProxySpider(cfg_path) spider.start()
class FactivaSpider:
    """Selenium-driven crawler for the Factiva source browser.

    A crawl logs in with the supplied driver, opens one tab of the search
    page, expands the tree shown there node by node and stores what it
    finds through ``self.db``. ``self.db`` is only created by
    ``get_center_pane``, so that must have been called before any method
    that inserts records.
    """

    # Text of the ``mnuMsg`` element while a menu is still loading.
    LOADING_TEXT = '正在加载...'
    # Number of ``cataN`` columns every catalog record carries.
    CATA_LEVELS = 8

    @property
    def name(self):
        """Display name of this spider."""
        return self._name

    @name.setter
    def name(self, value):
        self._name = value

    def __init__(self, parent, name):
        """Remember the parent window and name; nothing is opened yet."""
        self.parent = parent
        self.name = name
        self.list = None
        self.drv = None
        self.db = None

    def get_center_pane(self):
        """Return the list control, creating it and ``self.db`` on first use."""
        if self.list is None:
            self.db = Database()
            self.list = FactivaListCtrl(
                self.parent, self.db, style=wx.LC_REPORT | wx.BORDER_NONE)
        return self.list

    # ------------------------------------------------------------------
    # Crawl entry points
    # ------------------------------------------------------------------
    def crawling(self, drv, usr, pwd, start_region=11):
        """Crawl the sources listed per region.

        :param drv: Selenium driver to use.
        :param usr: login name.
        :param pwd: login password.
        :param start_region: index of the first top-level region to
            process; earlier ones are skipped. The default of 11 is the
            value that used to be hard-coded here (TODO in the original:
            start from Africa).
        """
        self.drv = drv
        self.login(usr, pwd)

        regions = self.get_regions()
        if regions is None:
            # One retry, as before.
            regions = self.get_regions()
        if regions is None:
            # Previously this fell through and raised TypeError while
            # iterating None.
            LOG.error('Region list not available, crawling aborted!')
            return

        for ridx, region in enumerate(regions):
            if ridx < start_region:
                continue
            btn = region.find_element_by_tag_name('span')
            arr = region.find_elements_by_tag_name('a')
            region_name = arr[0].text.strip()
            self.scroll_and_click(btn)
            items = region.find_elements_by_css_selector('div li')
            # TODO: allow resuming from a given item inside a region.
            for item in items:
                self.process_item(region_name, item)

    def crawling_cata_industry(self, drv, usr, pwd):
        """Crawl the industry catalog tree (tab ``inTab``)."""
        self._crawl_tab(drv, usr, pwd, 'inTab', 'div[id=inMnu] li',
                        self.process_cata_industry)

    def crawling_cata_expert(self, drv, usr, pwd):
        """Crawl the expert catalog tree (tab ``fesTab``)."""
        self._crawl_tab(drv, usr, pwd, 'fesTab', 'div[id=fesMnu]>ul>li',
                        self.process_cata_expert)

    def crawling_cata_news(self, drv, usr, pwd):
        """Crawl the news catalog tree (tab ``nsTab``)."""
        self._crawl_tab(drv, usr, pwd, 'nsTab', 'div[id=nsMnu]>ul>li',
                        self.process_cata_news)

    def crawling_by_industry(self, drv, usr, pwd):
        """Crawl the source tab (``scTab``) with the by-industry category."""
        self._crawl_tab(drv, usr, pwd, 'scTab', 'div[id=scMnu]>ul>li',
                        self.process_by_industry, category='按行业')

    def _crawl_tab(self, drv, usr, pwd, tab_id, selector, cb, category=None):
        """Log in, open one search-page tab and walk the tree under it.

        When ``category`` is given it is chosen in the ``scCat`` select
        box before the tree is processed.
        """
        self.drv = drv
        self.login(usr, pwd)
        self.load_search_page()
        drv.find_element_by_id(tab_id).click()
        self.wait_loading()
        if category is not None:
            select = Select(drv.find_element_by_id('scCat'))
            select.select_by_visible_text(category)
            self.wait_loading()
        self.process_tree(selector, cb)
        LOG.info('Complete!')

    # ------------------------------------------------------------------
    # Leaf callbacks
    # ------------------------------------------------------------------
    @staticmethod
    def _catalog_record(txt):
        """Turn an underscore-joined path into a catalog record.

        ``'a_b'`` becomes ``{'uid': None, 'cata0': 'a', 'cata1': 'b',
        'cata2': '', ... 'cata7': ''}``. Levels beyond ``CATA_LEVELS`` are
        kept under their own ``cataN`` key, as before.
        """
        parts = txt.split('_')
        rec = {'uid': None}
        for level, part in enumerate(parts):
            rec['cata%s' % level] = part
        for level in range(len(parts), FactivaSpider.CATA_LEVELS):
            rec['cata%s' % level] = ''
        return rec

    def _insert_catalog(self, txt, query):
        """Insert the catalog record for ``txt`` using ``query``."""
        self.db.insert([self._catalog_record(txt)], query)
        LOG.info('Insert:' + txt)

    def process_cata_industry(self, txt, el):
        """Store one industry catalog path; ``el`` is unused."""
        self._insert_catalog(txt, 'spider_factiva_cata_industry_insert')

    def process_cata_expert(self, txt, el):
        """Store one expert catalog path; ``el`` is unused."""
        self._insert_catalog(txt, 'spider_factiva_cata_expert_insert')

    def process_cata_news(self, txt, el):
        """Store one news catalog path; ``el`` is unused."""
        self._insert_catalog(txt, 'spider_factiva_cata_news_insert')

    def process_by_industry(self, txt, el):
        """Open the detail popup of a source leaf and store what it shows.

        The popup is tried up to three times. A source whose popup never
        yields at least one table row is logged and skipped.
        """
        drv = self.drv
        o = self._catalog_record(txt)
        max_retry = 3

        sarr = el.find_elements_by_tag_name('a')
        process_source_name = sarr[0].text.replace('\n', '').split(':')[1].strip()
        detail_btn = sarr[1]
        LOG.info('Process %s' % process_source_name)

        # for/else: the else branch runs only when no attempt reached
        # ``break``. The earlier counter check also fired when the last
        # permitted attempt had succeeded, dropping a good record.
        for _ in range(max_retry):
            popup = self.show_detail_dialog(detail_btn)
            if popup is None:
                LOG.error('Fail to popup dialog, try again!')
                continue
            raw_text = drv.execute_script(
                "return arguments[0].outerHTML;", popup)
            trs = BeautifulSoup(raw_text, 'html.parser').find_all('tr')
            if len(trs) < 1:
                LOG.error('Popup dialog is empty, try again!')
                continue
            o['name'] = trs[0].find('img').attrs['title'].strip()
            o['raw_text'] = raw_text
            o['link'] = self.find_popup_field_value(raw_text, '网址:')
            break
        else:
            LOG.error('%s source not processed!' % process_source_name)
            return

        self.db.insert([o], 'spider_factiva_by_industry_insert')
        LOG.info('Get info %s' % o['name'])

    # ------------------------------------------------------------------
    # Page navigation
    # ------------------------------------------------------------------
    def login(self, usr, pwd):
        """Open the login page, submit the credentials and wait for the
        main page header. Failures are printed, not raised."""
        drv = self.drv
        drv.get(
            'https://global.factiva.com/factivalogin/login.asp?productname=global'
        )
        try:
            LOG.info('Wait login page show...')
            WebDriverWait(drv, 30).until(
                EC.presence_of_element_located(
                    (By.ID, 'darktooltip-undefined')))
            LOG.info('Login page show complete.')
            el_usr = drv.find_element_by_id('email')
            el_pwd = drv.find_element_by_id('password')
            el_btn = drv.find_element_by_class_name('sign-in')
            # Pass the values as script arguments instead of formatting
            # them into the JavaScript source: a quote or backslash in a
            # credential used to break (or alter) the script.
            drv.execute_script("arguments[0].value = arguments[1];",
                               el_usr, usr)
            drv.execute_script("arguments[0].value = arguments[1];",
                               el_pwd, pwd)
            el_btn.click()
            LOG.info('Wait main page show...')
            WebDriverWait(drv, 30).until(
                EC.presence_of_element_located((By.ID, 'dj_new-header')))
            LOG.info('Main page header complete.')
        except Exception:
            traceback.print_exc()

    def load_search_page(self):
        """Open the search page.

        :return: ``True`` once the ``inpillscontextmenu`` element is
            present, ``False`` when loading or waiting failed.
        """
        drv = self.drv
        rtn = False
        try:
            drv.get('https://global.factiva.com/sb/default.aspx?NAPC=S')
            LOG.info('Wait search page shown...')
            WebDriverWait(drv, 30).until(
                EC.presence_of_element_located((By.ID, 'inpillscontextmenu')))
            LOG.info('Search page shown!')
            rtn = True
        except Exception:
            LOG.error('Search page not found!')
            traceback.print_exc()
        return rtn

    # ------------------------------------------------------------------
    # Tree walking
    # ------------------------------------------------------------------
    def process_tree(self, selector, cb=None):
        """Walk every top-level item matched by ``selector``.

        :param cb: called as ``cb(path_text, element)`` for each leaf.
        """
        drv = self.drv
        top_items = drv.find_elements_by_css_selector(selector)
        total = len(top_items)
        LOG.info('Top-level items num:%s' % total)
        for idx, item in enumerate(top_items):
            self.process_tree_item(item, cb,
                                   idx_str='%s/%s' % (idx + 1, total))

    def process_tree_item(self, el, cb=None, parent_str='', idx_str=''):
        """Process a leaf, or expand a branch and recurse into its children.

        :param parent_str: underscore-joined texts of the ancestors.
        :param idx_str: position trail used only for logging.
        """
        if self.is_tree_leaf(el):
            LOG.info('Process leaf:%s' % idx_str)
            self.process_item_info(el, cb, parent_str)
            return

        item_btn = el.find_element_by_class_name('mnuBtn')
        self.scroll_and_click(item_btn)
        sub_items = el.find_elements_by_xpath('./div/ul/li')
        if not sub_items:
            return

        # The branch's own text is the same for every child; read it once.
        etxt = self.get_tree_node_text(el)
        pstr = etxt if parent_str == '' else parent_str + '_' + etxt
        total = len(sub_items)
        for sidx, sub_item in enumerate(sub_items):
            istr = '%s_%s/%s' % (idx_str, sidx + 1, total)
            self.process_tree_item(sub_item, cb, pstr, istr)

    def process_item_info(self, el, cb=None, parent_str=''):
        """Build the full path text of a leaf and hand it to ``cb``.

        Without a callback the path is only logged.
        """
        # NOTE(review): for a top-level leaf ``parent_str`` is '' and the
        # path starts with '_', unlike the join in process_tree_item.
        # Left unchanged because stored catalog rows may rely on it.
        txt = parent_str + '_' + self.get_tree_node_text(el)
        if cb is None:
            # Used to log the undefined name ``item_text`` (NameError).
            LOG.info(txt)
        else:
            cb(txt, el)

    def show_detail_dialog(self, detail_btn):
        """Run the button's ``onclick`` script and return the popup body.

        :return: the ``popup-body`` element, or ``None`` when the popup
            did not become visible within 3 seconds or any step failed.
        """
        drv = self.drv
        popup = None
        try:
            drv.execute_script("arguments[0].scrollIntoView();", detail_btn)
            # ESC is sent first, presumably to dismiss a popup left open
            # by a previous attempt.
            ActionChains(drv).send_keys(Keys.ESCAPE).perform()
            popup_script = detail_btn.get_attribute('onclick')
            drv.execute_script(popup_script)
            WebDriverWait(drv, 3).until(
                EC.visibility_of_element_located(
                    (By.ID, 'relInfoPopupBalloon__overlay')))
            popup = drv.find_element_by_class_name('popup-body')
            time.sleep(2)
        except Exception as e:
            # Best effort: the caller retries on None. Record the reason
            # instead of discarding it silently.
            LOG.info('Detail dialog not shown: %s' % e)
        return popup

    def is_tree_leaf(self, el):
        """Return ``True`` when ``el`` contains no ``mnuBtn`` element."""
        try:
            btn = el.find_element_by_class_name('mnuBtn')
        except Exception:
            # The lookup raises when nothing matches.
            return True
        return btn is None

    def get_tree_node_text(self, el):
        """Return the stripped text of the first ``<a>`` inside ``el``."""
        arr = el.find_elements_by_tag_name('a')
        return arr[0].text.strip()

    def wait_loading(self, wait=30):
        """Wait for the ``mnuMsg`` loading message to come and go.

        Polls once a second: up to 3 times for the message to show the
        loading text, then up to ``wait`` times for it to change or
        disappear. Never raises; a timeout is only logged.

        :param wait: maximum number of one-second polls for the message
            to go away.
        """
        drv = self.drv
        wait_show = 3
        finish = False
        div = None

        for _ in range(wait_show):
            time.sleep(1)
            LOG.info('Try find loading div...')
            try:
                div = drv.find_element_by_class_name('mnuMsg')
                if div.text == self.LOADING_TEXT:
                    break
            except Exception:
                pass

        if div is not None:
            LOG.info('Start loading...')
            for _ in range(wait):
                time.sleep(1)
                try:
                    div = drv.find_element_by_class_name('mnuMsg')
                    if div.text != self.LOADING_TEXT:
                        finish = True
                        break
                except Exception:
                    # The element is gone: loading is over.
                    finish = True
                    break
        else:
            LOG.info('Loading div not found, suppose finish loading')
            finish = True

        LOG.info('Finish loading.'
                 if finish else 'Load timeout, suppose finished loading.')

    def scroll_and_click(self, el):
        """Scroll ``el`` into view, invoke its ``onclick`` handler and wait
        for loading to finish."""
        drv = self.drv
        drv.execute_script("arguments[0].scrollIntoView();", el)
        time.sleep(1)
        drv.execute_script("arguments[0].onclick();", el)
        self.wait_loading()

    def find_popup_field_value(self, html_str, key):
        """Return the value shown next to label ``key`` in a popup's HTML.

        Rows are scanned from the third ``<tr>`` on. The label is read
        from child 0 of the row and the value from child 2 (the text of
        its ``<div>`` when it has one).

        :return: the stripped value, or ``None`` when no row matches.
        """
        key_val = key.strip()
        # The parser is named explicitly so the result does not depend on
        # which optional parsers happen to be installed.
        soup = BeautifulSoup(html_str, 'html.parser')
        for idx, tr in enumerate(soup.find_all('tr')):
            if idx < 2 or len(tr) < 3:
                continue
            key_td = tr.contents[0]
            val_td = tr.contents[2]
            if key_td.text.strip() == key_val:
                if val_td.div:
                    return val_td.div.text.strip()
                return val_td.text.strip()
        return None

    # ------------------------------------------------------------------
    # Crawl by region
    # ------------------------------------------------------------------
    def get_regions(self):
        """Open the search page, switch the source tab to the by-region
        category and return the region elements.

        :return: list of elements, or ``None`` when any step failed (the
            traceback is printed).
        """
        drv = self.drv
        regions = None
        try:
            drv.get('https://global.factiva.com/sb/default.aspx?NAPC=S')
            LOG.info('Wait search page show...')
            WebDriverWait(drv, 30).until(
                EC.presence_of_element_located((By.ID, 'inpillscontextmenu')))
            LOG.info('Search page complete.')
            drv.find_element_by_id('scTab').click()
            select = Select(drv.find_element_by_id('scCat'))
            LOG.info('Wait 按地区 show...')
            select.select_by_visible_text('按地区')
            self.wait_load()
            LOG.info('按地区 complete.')
            regions = drv.find_elements_by_css_selector('div[id=scMnu] li')
            LOG.info('地区数:%s' % len(regions))
        except Exception:
            traceback.print_exc()
        return regions

    def process_item(self, region_name, el, index=-1, count=0):
        """Process a source leaf, or expand a branch and recurse.

        :param index: position of ``el`` among its siblings (-1 for the
            items passed in by ``crawling``).
        :param count: number of siblings, used for logging.
        """
        if self.is_leaf(el):
            self.process_source(region_name, el, index, count)
            return

        item_btn = el.find_element_by_tag_name('span')
        self.scroll_and_click(item_btn)
        sub_items = el.find_elements_by_css_selector('div li')
        total = len(sub_items)
        # TODO: allow resuming from a given child of a region.
        for sidx, sub_item in enumerate(sub_items):
            self.process_item(region_name, sub_item, sidx, total)

    def _reopen_popup(self, popup_script):
        """Dismiss any open popup, run ``popup_script`` again and return
        the ``popup-body`` element, or ``None`` when it is not there."""
        drv = self.drv
        ActionChains(drv).send_keys(Keys.ESCAPE).perform()
        drv.execute_script(popup_script)
        time.sleep(15)
        try:
            return drv.find_element_by_class_name('popup-body')
        except Exception:
            # The lookup raises when nothing matches.
            return None

    def process_source(self, region_name, source, index, count):
        """Open a source's detail popup, read its fields and store them.

        The popup is retried up to three times when it does not appear
        and up to three more times when it contains no table row. A
        source that still fails is logged and skipped.
        """
        LOG.info('%s %s/%s' % (region_name, index + 1, count))
        drv = self.drv
        max_retry = 3

        sarr = source.find_elements_by_tag_name('a')
        process_source_name = sarr[0].text.replace('\n', '').split(':')[1].strip()
        LOG.info('Process %s' % process_source_name)

        detail_btn = sarr[1]
        drv.execute_script("arguments[0].scrollIntoView();", detail_btn)
        popup_script = detail_btn.get_attribute('onclick')
        drv.execute_script(popup_script)

        popup = None
        try:
            WebDriverWait(drv, 3).until(
                EC.visibility_of_element_located(
                    (By.ID, '_ceprogress__overlay')))
            WebDriverWait(drv, 3).until(
                EC.invisibility_of_element_located(
                    (By.ID, '_ceprogress__overlay')))
            popup = drv.find_element_by_class_name('popup-body')
        except Exception:
            # Best effort: the retry loop below handles a missing popup.
            pass
        time.sleep(3)

        # The counter is now advanced and a failed lookup is caught, so
        # the loop ends after ``max_retry`` attempts.
        attempts = 0
        while popup is None and attempts < max_retry:
            attempts += 1
            LOG.info('Not found popup, try again for %s' % process_source_name)
            popup = self._reopen_popup(popup_script)
        if popup is None:
            # The format string has two placeholders; it used to receive
            # a single value and raised TypeError.
            LOG.error('%s source %s not processed!' %
                      (index + 1, process_source_name))
            return

        raw_text = drv.execute_script("return arguments[0].outerHTML;", popup)
        trs = BeautifulSoup(raw_text, 'html.parser').find_all('tr')
        attempts = 0
        while len(trs) < 1 and attempts < max_retry:
            attempts += 1
            LOG.info('Not found popup, try again for %s' % process_source_name)
            popup = self._reopen_popup(popup_script)
            if popup is None:
                continue
            raw_text = drv.execute_script(
                "return arguments[0].outerHTML;", popup)
            trs = BeautifulSoup(raw_text, 'html.parser').find_all('tr')
        if len(trs) < 1:
            # Used to reference the undefined name ``source_idx``.
            LOG.error('%s source %s not processed!' %
                      (index + 1, process_source_name))
            return

        # The enclosing tree nodes carry the country (3 levels up) and
        # the sub-region (6 levels up).
        country_name = source.find_element_by_xpath(
            './../../..').find_elements_by_tag_name('a')[0].text.strip()
        sub_region = source.find_element_by_xpath(
            './../../../../../..').find_elements_by_tag_name(
                'a')[0].text.strip()
        district = None
        if sub_region == '美国':
            # This tree has one more level: shift the names up by one.
            district = country_name
            country_name = sub_region
            sub_region = region_name

        name = trs[0].find('img').attrs['title'].strip()
        desc = self.find_popup_field_value(raw_text, '描述:')
        code = self.find_popup_field_value(raw_text, '资讯来源代码:')
        lang = self.find_popup_field_value(raw_text, '语言:')
        freq = self.find_popup_field_value(raw_text, '频数:')
        link = self.find_popup_field_value(raw_text, '网址:')
        rec = {
            'uid': None,
            'name': name,
            'region': region_name,
            'sub_region': sub_region,
            'country': country_name,
            'district': district if district is not None else '',
            'raw_text': raw_text,
            'description': desc,
            'source_code': code,
            'language': lang,
            # NOTE(review): key is misspelled but presumably matches the
            # SQL parameter name - do not rename without the query.
            'frequecy': freq,
            'link': link,
        }

        # Close the popup before writing.
        ActionChains(drv).send_keys(Keys.ESCAPE).perform()
        self.db.insert_update(
            [rec], 'spider_factiva_region_check',
            'spider_factiva_region_insert', 'spider_factiva_region_update',
            ['region', 'sub_region', 'country', 'district', 'name'])
        LOG.info('Get info %s' % name)

    def wait_load(self):
        """Wait for the ``mnuMsg`` element to appear and disappear (up to
        20 seconds each), then pause 10 seconds."""
        try:
            # ``self.drv`` is the driver attribute this class sets; the
            # former ``self.browser`` never existed, so the wait always
            # fell straight into the except branch.
            WebDriverWait(self.drv, 20).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'mnuMsg')))
            LOG.info('Found loading..')
            WebDriverWait(self.drv, 20).until(
                EC.invisibility_of_element_located((By.CLASS_NAME, 'mnuMsg')))
            LOG.info('Loading gone..')
        except Exception:
            LOG.info('Wait loading timeout')
        # Pause kept on every path: it was the only delay that took
        # effect before the fix above.
        time.sleep(10)

    def is_leaf(self, el):
        """Return ``True`` when ``el`` contains no ``mnuBtn`` element."""
        return self.is_tree_leaf(el)
def update_record(self, rec):
    """Run the ``ippool_update_valid`` query for one grid record.

    :param rec: sequence whose items 1, 2 and 4 are passed to the query
        as ``ip``, ``port`` and ``valid``.
    """
    params = {
        'ip': rec[1],
        'port': rec[2],
        'valid': rec[4],
    }
    Database().query('ippool_update_valid', **params)
def get_all_record(self):
    """Return every ``ippool_all`` row as a grid row.

    :return: list of ``[position, ip, port, conn_type, valid]`` lists,
        where ``position`` is the row's 0-based index in the query result.
    """
    result = Database().query('ippool_all').as_dict()

    grid_rows = []
    for position, record in enumerate(result):
        grid_rows.append([
            position,
            record['ip'],
            record['port'],
            record['conn_type'],
            record['valid'],
        ])
    return grid_rows
def instance(cls):
    """Build a ``TestGridModel`` from every ``ippool_all`` row.

    :return: a new ``TestGridModel`` holding the rows as dictionaries.
    """
    db = Database()
    try:
        rows = db.query('ippool_all').as_dict()
    finally:
        # Close even when the query or conversion raises, so the handle
        # is not leaked.
        db.close()
    return TestGridModel(rows)