    def __init__(self):
        self.Logger = Logger('getAjkData')
        self.user_agents = Headers().user_agents
        self.headers = Headers().headers
        self.cfg = self.utils.pathToConfig()
        self.mysql = Mysql(self.cfg.get('DB', 'DBHOST'),
                           int(self.cfg.get('DB', 'DBPORT')),
                           self.cfg.get('DB', 'DBUSER'),
                           self.cfg.get('DB', 'DBPWD'), 3, 5)
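    # The [DB] section that self.utils.pathToConfig() loads is expected to
    # provide the keys read above. A hedged example of its shape (values are
    # placeholders, not taken from the original project):
    #
    #   [DB]
    #   DBHOST = 127.0.0.1
    #   DBPORT = 3306
    #   DBUSER = root
    #   DBPWD  = secret
    #   DBNAME = spider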
    def check_db_ip(self):
        mysql = Mysql(self.cfg.get('DB', 'DBHOST'),
                      int(self.cfg.get('DB', 'DBPORT')),
                      self.cfg.get('DB', 'DBUSER'),
                      self.cfg.get('DB', 'DBPWD'), 3, 5)
        Loggers = Logger(special_log_file='checkDbIp')
        while True:
            Loggers.Info(u'>>>>> start checking ips already in the database <<<<<')
            sql_select = ("SELECT * FROM " + self.cfg.get('DB', 'DBNAME') +
                          ".ipProxy LIMIT 1000")
            sql_update = ("UPDATE " + self.cfg.get('DB', 'DBNAME') +
                          ".ipProxy SET power = (%s), update_time = (%s) WHERE ip = (%s)")
            sql_delete = ("DELETE FROM " + self.cfg.get('DB', 'DBNAME') +
                          ".ipProxy WHERE ip = (%s)")
            cur_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
            try:
                ipsFromDb = None
                try:
                    ipsFromDb = mysql.getMany(sql_select, 1000)
                except BaseException as e:
                    Loggers.Error(u'>>>>> failed to fetch ips from the database: ' + str(e) + ' <<<<<')
                Loggers.Info(u'>>>>> fetched all ips <<<<<')
                if ipsFromDb:
                    for item in ipsFromDb:
                        Loggers.Info(u'>>>>> checking ' + str(item['ip']) + '|' + str(item['power']) + ' <<<<<')
                        result = self.utils.checkIpForAJK(item['ip'])
                        power = int(item['power'])
                        if result and result['move'] == 'add':
                            powerNew = power + 1
                            Loggers.Info(u'>>>>> updating ip ' + item['ip'] + ': power ' + str(power) + ' -> ' + str(powerNew) + ' <<<<<')
                            mysql.update(sql_update, (str(powerNew), cur_time, item['ip']))
                        elif result and result['move'] == 'minus' and power > 1:
                            powerNew = power - 1
                            Loggers.Info(u'>>>>> updating ip ' + item['ip'] + ': power ' + str(power) + ' -> ' + str(powerNew) + ' <<<<<')
                            mysql.update(sql_update, (str(powerNew), cur_time, item['ip']))
                        else:
                            Loggers.Info(u'>>>>> deleting ip ' + item['ip'] + ' <<<<<')
                            # the parameter must be a one-element tuple, hence the trailing comma
                            mysql.delete(sql_delete, (item['ip'],))
                mysql.end()
            except BaseException as e:
                Loggers.Error(u'>>>>> check_db_ip failed: ' + str(e) + ' <<<<<')
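    # A minimal sketch of what a liveness probe like utils.checkIpForAJK could
    # look like (hypothetical; the real Utils implementation is not shown in
    # this excerpt, and the probe URL is an assumption). check_db_ip above
    # expects a dict with a 'move' key of 'add' or 'minus', or a falsy value
    # to trigger deletion of the row.
    def _check_ip_sketch(self, ip):
        proxies = {'http': ip, 'https': ip}
        try:
            r = requests.get('https://www.anjuke.com/', proxies=proxies,
                             timeout=10, headers=self.headers)
            if r.status_code == 200:
                return {'move': 'add'}   # proxy answered: raise its power
            return {'move': 'minus'}     # reachable but refused: lower its power
        except requests.RequestException:
            return False                 # dead proxy: caller deletes the row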
    def check_all_thread(self, funcs):
        Loggers = Logger(special_log_file='checkAllThread')
        while True:
            try:
                Loggers.Info(u'>>>>> start checking all threads <<<<<')
                for fun in funcs:
                    Loggers.Info(u'>>>>> checking thread ' + str(fun.getName()) + ' <<<<<')
                    if not fun.is_alive():
                        # caution: start() raises RuntimeError on a thread that
                        # has already run; see the restart-safe variant below
                        Loggers.Info(u'>>>>> thread ' + str(fun.getName()) + ' stopped, restarting <<<<<')
                        fun.start()
                    else:
                        Loggers.Info(u'>>>>> thread ' + str(fun.getName()) + ' is running <<<<<')
                time.sleep(60 * 60)
            except BaseException as e:
                Loggers.Error(u'>>>>> check_all_thread failed: ' + str(e) + ' <<<<<')
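    # threading.Thread objects cannot be restarted: calling start() a second
    # time raises RuntimeError, so the restart in check_all_thread above only
    # works for threads that were never started. A hedged restart-safe variant
    # (the `factories` mapping of name -> zero-argument callable returning a
    # fresh Thread is an assumption, not part of the original interface):
    def check_all_thread_restartable(self, factories):
        Loggers = Logger(special_log_file='checkAllThread')
        workers = dict((name, make()) for name, make in factories.items())
        for worker in workers.values():
            worker.start()
        while True:
            time.sleep(60 * 60)
            for name in list(workers):
                if not workers[name].is_alive():
                    Loggers.Info('>>>>> thread ' + name + ' died, rebuilding <<<<<')
                    workers[name] = factories[name]()
                    workers[name].start()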
    def get_ip_from_xici(self):
        Loggers = Logger(special_log_file='getProxyXiCi')
        while True:
            try:
                availableIpsOneWeb = []
                startGetIpTime = time.time()
                startGetIpTimeFormat = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
                title = u'XiCi proxy'
                Loggers.Info('>>>>> ' + startGetIpTimeFormat + '|' + title + u'|start fetching ips <<<<<')
                url = 'http://www.xicidaili.com/nn/'
                head = self.headers
                head['user-agent'] = random.choice(self.user_agents)
                try:
                    Loggers.Info('>>>>> ' + title + u'|requesting url ' + url + ' <<<<<')
                    r = requests.get(url, timeout=10, headers=head)
                    soup = BeautifulSoup(r.text, 'html.parser')
                    # the ip table lists address and port in consecutive <td> cells
                    cells = soup.find('table', attrs={'id': 'ip_list'}).find_all('td')
                    strText = ''
                    ips = []
                    for cell in cells:
                        content = cell.get_text().strip()
                        if re.match(r'^(?:[0-9]{1,3}\.){3}[0-9]{1,3}$', content):
                            strText = content
                        if re.match(r'^([0-9]|[1-9]\d{1,3}|[1-5]\d{4}|6[0-4]\d{3}|65[0-4]\d{2}|655[0-2]\d|6553[0-5])$', content):
                            strText = strText + ':' + content
                            ips.append(strText)
                    endGetIpTime = time.time()
                    endGetIpTimeFormat = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
                    Loggers.Info('>>>>> ' + endGetIpTimeFormat + '|' + title + u'|done fetching, got ' + str(len(ips)) + ' ips <<<<<')
                    Loggers.Info('>>>>> ' + endGetIpTimeFormat + '|' + title + u'|start checking ips, fetch took ' + str(endGetIpTime - startGetIpTime) + 's <<<<<')
                    for ip in ips:
                        Loggers.Info(u'>>>>> checking ip: ' + str(ip) + ' <<<<<')
                        start = time.time()
                        if self.utils.checkIpForAJK(ip):
                            end = time.time()
                            availableIpsOneWeb.append({
                                'source': 'xici',
                                'ip': ip,
                                'time': str(end - start)
                            })
                            Loggers.Info('>>>>> ip: ' + str(ip) + u' usable <<<<<')
                        else:
                            Loggers.Info('>>>>> ip: ' + str(ip) + u' unusable <<<<<')
                    endCheckIpTime = time.time()
                    endCheckIpTimeFormat = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
                    Loggers.Info('>>>>> ' + endCheckIpTimeFormat + '|' + title + u'|done checking, check took ' + str(endCheckIpTime - endGetIpTime) + 's <<<<<')
                    Loggers.Info('>>>>> ' + title + u'|success rate: ' + str(len(availableIpsOneWeb)) + '/' + str(len(ips)) + ' <<<<<')
                    Loggers.Info('>>>>> ' + endCheckIpTimeFormat + '|' + title + u'|finished, ' + str(len(availableIpsOneWeb)) + u' usable ips, total ' + str(endCheckIpTime - startGetIpTime) + 's <<<<<')
                    self.insert_data(Loggers, availableIpsOneWeb)
                except BaseException as e:
                    Loggers.Error(u'>>>>> request failed: ' + str(e) + ' <<<<<')
            except BaseException as e:
                Loggers.Error(u'>>>>> fetch loop failed: ' + str(e) + ' <<<<<')
            time.sleep(10)
    def get_ip_from_66ip(self):
        Loggers = Logger(special_log_file='getProxy66Ip')
        while True:
            try:
                availableIpsOneWeb = []
                startGetIpTime = time.time()
                title = u'66ip.cn'
                startGetIpTimeFormat = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
                Loggers.Info('>>>>> ' + startGetIpTimeFormat + '|' + title + u'|start fetching ips <<<<<')
                url = 'http://www.66ip.cn/mo.php?sxb=&tqsl=100&port=&export=&ktip=&sxa=&submit=%CC%E1++%C8%A1&textarea=http%3A%2F%2Fwww.66ip.cn%2F%3Fsxb%3D%26tqsl%3D100%26ports%255B%255D2%3D%26ktip%3D%26sxa%3D%26radio%3Dradio%26submit%3D%25CC%25E1%2B%2B%25C8%25A1'
                head = self.headers
                head['user-agent'] = random.choice(self.user_agents)
                try:
                    Loggers.Info('>>>>> ' + title + u'|requesting url ' + url + ' <<<<<')
                    r = requests.get(url, timeout=10, headers=head)
                    r.encoding = 'gb2312'  # the page is served as gb2312
                    # capture ip/port pairs from the plain-text response
                    p = r'(?:((?:\d|[1-9]\d|1\d{2}|2[0-5][0-5])\.(?:\d|[1-9]\d|1\d{2}|2[0-5][0-5])\.(?:\d|[1-9]\d|1\d{2}|2[0-5][0-5])\.(?:\d|[1-9]\d|1\d{2}|2[0-5][0-5]))\D+?(6[0-5]{2}[0-3][0-5]|[1-5]\d{4}|[1-9]\d{1,3}|[0-9]))'
                    iplist = re.findall(p, r.text)
                    ips = [item[0] + ':' + item[1] for item in iplist]
                    endGetIpTime = time.time()
                    endGetIpTimeFormat = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
                    Loggers.Info('>>>>> ' + endGetIpTimeFormat + '|' + title + u'|done fetching, got ' + str(len(ips)) + ' ips <<<<<')
                    Loggers.Info('>>>>> ' + endGetIpTimeFormat + '|' + title + u'|start checking ips, fetch took ' + str(endGetIpTime - startGetIpTime) + 's <<<<<')
                    for ip in ips:
                        start = time.time()
                        Loggers.Info(u'>>>>> checking ip: ' + str(ip) + ' <<<<<')
                        if self.utils.checkIpForAJK(ip):
                            end = time.time()
                            availableIpsOneWeb.append({
                                'source': '66ip',
                                'ip': ip,
                                'time': str(end - start)
                            })
                            Loggers.Info('>>>>> ip: ' + str(ip) + u' usable <<<<<')
                        else:
                            Loggers.Info('>>>>> ip: ' + str(ip) + u' unusable <<<<<')
                    endCheckIpTime = time.time()
                    endCheckIpTimeFormat = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
                    Loggers.Info('>>>>> ' + endCheckIpTimeFormat + '|' + title + u'|done checking, check took ' + str(endCheckIpTime - endGetIpTime) + 's <<<<<')
                    Loggers.Info('>>>>> ' + title + u'|success rate: ' + str(len(availableIpsOneWeb)) + '/' + str(len(ips)) + ' <<<<<')
                    Loggers.Info('>>>>> ' + endCheckIpTimeFormat + '|' + title + u'|finished, ' + str(len(availableIpsOneWeb)) + u' usable ips, total ' + str(endCheckIpTime - startGetIpTime) + 's <<<<<')
                    self.insert_data(Loggers, availableIpsOneWeb)
                except BaseException as e:
                    Loggers.Error(u'>>>>> request failed: ' + str(e) + ' <<<<<')
            except BaseException as e:
                Loggers.Error(u'>>>>> fetch loop failed: ' + str(e) + ' <<<<<')
            time.sleep(10)
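    # A hedged sketch of what insert_data could do (hypothetical; the real
    # method is defined elsewhere in this class): write each verified proxy
    # into ipProxy with an initial power of 1, using the same columns that
    # check_db_ip reads. Mysql.insertOne is an assumption, named by analogy
    # with the getMany/update/delete calls above.
    def _insert_data_sketch(self, Loggers, availableIps):
        sql = ("INSERT INTO " + self.cfg.get('DB', 'DBNAME') + ".ipProxy "
               "(ip, source, power, update_time) VALUES (%s, %s, %s, %s)")
        cur_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
        for item in availableIps:
            try:
                self.mysql.insertOne(sql, (item['ip'], item['source'], 1, cur_time))
            except BaseException as e:
                Loggers.Error('>>>>> insert failed for ' + item['ip'] + ': ' + str(e) + ' <<<<<')
        self.mysql.end()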
class ajkLoadDataAndInsert():
    city_list = []
    user_agents = []
    headers = {}
    utils = Utils()
    list_data = []
    ips = []
    ipIndex = 0
    ip = {}
    PROXYNAME = 'ipProxy'
    COLUMENAME = 'active_ajk_sec'

    def __init__(self):
        self.Logger = Logger('getAjkData')
        self.user_agents = Headers().user_agents
        self.headers = Headers().headers
        self.cfg = self.utils.pathToConfig()
        self.mysql = Mysql(self.cfg.get('DB', 'DBHOST'),
                           int(self.cfg.get('DB', 'DBPORT')),
                           self.cfg.get('DB', 'DBUSER'),
                           self.cfg.get('DB', 'DBPWD'), 3, 5)

    def load_detail_info_sec(self):
        self.Logger.Info(u'>>>>> start fetching detail data <<<<<')
        self.ip = self.ips[0]
        for city in self.city_list:
            for page in range(0, int(city['ajk_sec_pages'])):
                city_list_url = city['ajk_sec_url'].replace(
                    '?from=navigation', 'p' + str(int(page) + 1) + '/#filtersort')
                self.Logger.Info(u'>>>>> start fetching: ' + city['city_name'] +
                                 '|url:' + str(city_list_url) +
                                 '|ip:' + self.ip['ip'] + ' <<<<<')
                oneCityGetDown = True
                while oneCityGetDown:
                    try:
                        self.Logger.Info(u'>>>>> using ip: ' + str(self.ip['ip']) + ' <<<<<')
                        proxies = {'http': self.ip['ip'], 'https': self.ip['ip']}
                        head = self.headers
                        head['user-agent'] = random.choice(self.user_agents)
                        r = requests.get(city_list_url, timeout=10,
                                         proxies=proxies, headers=head)
                        time.sleep(random.random() * 10)
                        soup = BeautifulSoup(r.text, 'html.parser')
                        title = soup.find('title').get_text()
                        # a valid list page title contains "二手房" (second-hand housing)
                        if u'二手房' in title:
                            self.Logger.Info(u'>>>>> ip ' + str(self.ip['ip']) + u' usable|' + title + ' <<<<<')
                            items = soup.find(attrs={'id': 'houselist-mod-new'}).find_all('li')
                            for l in items:
                                oneDetailGetDown = True
                                while oneDetailGetDown:
                                    house_title = l.find(attrs={'class': 'house-title'}).find('a').attrs['title'].strip()
                                    price = l.find(attrs={'class': 'price-det'}).get_text().strip()
                                    try:
                                        detail_url = l.find(attrs={'class': 'house-title'}).find('a').attrs['href']
                                        self.Logger.Info(
                                            u'>>>>> start fetching: ' + house_title + '|' +
                                            detail_url.split('view/')[1].split('?')[0] +
                                            '|ip:' + self.ip['ip'] + ' <<<<<')
                                        proxies = {'http': self.ip['ip'], 'https': self.ip['ip']}
                                        head['user-agent'] = random.choice(self.user_agents)
                                        r_detail = requests.get(detail_url.split('now_time')[0],
                                                                timeout=10, proxies=proxies, headers=head)
                                        time.sleep(random.random() * 20)
                                        soup_detail = BeautifulSoup(r_detail.text, 'html.parser')
                                        title_detail = soup_detail.find('title').get_text()
                                        # a valid detail page title contains "58安居客"; a title
                                        # with "访问验证" means the ip hit the anti-bot check
                                        if u'58安居客' in title_detail and u'访问验证' not in title_detail:
                                            try:
                                                self.Logger.Info(u'>>>>> extracting detail fields|' + title_detail + ' <<<<<')
                                                detail_dict = self.get_data(soup_detail)
                                                detail_dict['city_id'] = city['city_id']
                                                detail_dict['city_name'] = city['city_name']
                                                detail_dict['source'] = 'ajk'
                                                detail_dict['house_id'] = detail_url.split('view/')[1].split('?')[0]
                                                detail_dict['link_url'] = detail_url.split('?')[0]
                                                detail_dict['title'] = house_title
                                                detail_dict['price'] = self.utils.str_to_num(price)
                                                oneDetailGetDown = False
                                                self.insert_update_data(detail_dict)
                                            except BaseException as e:
                                                self.Logger.Info(u'>>>>> failed to extract detail fields: ' + str(e) + ' <<<<<')
                                        elif u'可能被删除' in title_detail:
                                            # "可能被删除" in the title means the listing was removed
                                            self.Logger.Info(u'>>>>> dead link|' + title_detail + ' <<<<<')
                                            oneDetailGetDown = False
                                        else:
                                            self.Logger.Info(u'>>>>> ip for detail ' + str(self.ip['ip']) +
                                                             u' unusable|' + str(title_detail) + ' <<<<<')
                                            result_ip = self.utils.get_active_ip(
                                                self.ips, self.ip, self.Logger,
                                                self.PROXYNAME, self.mysql)
                                            self.ip = result_ip['active_ip']
                                            self.ips = result_ip['ips']
                                    except BaseException as e:
                                        self.Logger.Info(u'>>>>> ip for detail ' + str(self.ip['ip']) +
                                                         u' unusable, timed out: ' + str(e) + ' <<<<<')
                                        result_ip = self.utils.get_active_ip(
                                            self.ips, self.ip, self.Logger,
                                            self.PROXYNAME, self.mysql)
                                        self.ip = result_ip['active_ip']
                                        self.ips = result_ip['ips']
                            oneCityGetDown = False
                            self.Logger.Info(u'>>>>> ========== city: ' + city['city_name'] +
                                             u' page ' + str(int(page) + 1) +
                                             u' done ========== <<<<<')
                        else:
                            # list page rejected this ip: rotate to the next active proxy and retry
                            self.Logger.Info(u'>>>>> ip ' + str(self.ip['ip']) + u' unusable|' + str(title) + ' <<<<<')
                            result_ip = self.utils.get_active_ip(
                                self.ips, self.ip, self.Logger,
                                self.PROXYNAME, self.mysql)
                            self.ip = result_ip['active_ip']
                            self.ips = result_ip['ips']
                    except BaseException as e:
                        # list page request failed (typically a proxy timeout): rotate ip and retry
                        self.Logger.Info(u'>>>>> ip ' + str(self.ip['ip']) +
                                         u' unusable, timed out: ' + str(e) + ' <<<<<')
                        result_ip = self.utils.get_active_ip(
                            self.ips, self.ip, self.Logger,
                            self.PROXYNAME, self.mysql)
                        self.ip = result_ip['active_ip']
                        self.ips = result_ip['ips']
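
# A hedged usage sketch (an assumption, not part of the original module): the
# first class above is referred to here as `getProxy`, a hypothetical name,
# since its class statement falls outside this excerpt; `import threading` is
# assumed at module top. It wires the proxy harvesters and the db checker into
# daemon threads and hands the list to check_all_thread for supervision.
if __name__ == '__main__':
    proxy = getProxy()  # hypothetical class name for the methods above
    funcs = [
        threading.Thread(name='getProxyXiCi', target=proxy.get_ip_from_xici),
        threading.Thread(name='getProxy66Ip', target=proxy.get_ip_from_66ip),
        threading.Thread(name='checkDbIp', target=proxy.check_db_ip),
    ]
    for t in funcs:
        t.setDaemon(True)
        t.start()
    proxy.check_all_thread(funcs)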