Example #1
0
 def __init__(self):
     """Set up logging, request headers, config, and the shared DB pool."""
     self.Logger = Logger('getAjkData')
     # FIX: instantiate Headers once instead of twice.
     header_source = Headers()
     self.user_agents = header_source.user_agents
     self.headers = header_source.headers
     self.cfg = self.utils.pathToConfig()  # utils is a class attribute
     self.mysql = Mysql(self.cfg.get('DB', 'DBHOST'),
                        int(self.cfg.get('DB', 'DBPORT')),
                        self.cfg.get('DB', 'DBUSER'),
                        self.cfg.get('DB', 'DBPWD'), 3, 5)
Example #2
0
 def check_db_ip(self):
     """Endlessly re-score the proxy ips stored in the ipProxy table.

     Each pass pulls up to 1000 rows, re-checks every ip against AJK via
     utils.checkIpForAJK, then raises/lowers the row's 'power' score or
     deletes the row when the ip fails while already at minimum power.
     Runs forever; intended to be the body of a worker thread.
     """
     mysql = Mysql(self.cfg.get('DB', 'DBHOST'),
                   int(self.cfg.get('DB', 'DBPORT')),
                   self.cfg.get('DB', 'DBUSER'),
                   self.cfg.get('DB', 'DBPWD'), 3, 5)
     Loggers = Logger(special_log_file='checkDbIp')
     # SQL text is loop-invariant (config does not change) -- build once.
     db_name = self.cfg.get("DB", "DBNAME")
     sql_select = "SELECT * FROM " + db_name + ".ipProxy LIMIT 1000"
     sql_update = ("UPDATE " + db_name + ".ipProxy SET power = (%s), "
                   "update_time = (%s) WHERE ip = (%s)")
     sql_delete = "DELETE FROM " + db_name + ".ipProxy WHERE ip = (%s)"
     while True:
         Loggers.Info(u'>>>>>开始检查数据库中已有ip<<<<<')
         cur_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
         # NOTE(review): BaseException also swallows KeyboardInterrupt;
         # kept as in the original since this runs in a worker thread.
         try:
             try:
                 ipsFromDb = mysql.getMany(sql_select, 1000)
             except BaseException as e:
                 # BUG FIX: the original fell through here, leaving
                 # ipsFromDb unbound (NameError below); retry instead.
                 Loggers.Error(u'>>>>>从数据取出所有ip出错' + str(e) + '<<<<<')
                 continue
             Loggers.Info(u'>>>>>取出所有ip<<<<<')
             if ipsFromDb:
                 for item in ipsFromDb:
                     Loggers.Info(u'>>>>>检查' + str(item['ip']) + '|' +
                                  str(item['power']) + '<<<<<')
                     result = self.utils.checkIpForAJK(item['ip'])
                     power = int(item['power'])
                     # 'add' -> bump score; 'minus' above the floor -> drop
                     # score; anything else -> the row is deleted.
                     powerNew = None
                     if result and result['move'] == 'add':
                         powerNew = power + 1
                     elif result and result['move'] == 'minus' and power > 1:
                         powerNew = power - 1
                     if powerNew is not None:
                         Loggers.Info(u'>>>>>更新ip:' + item['ip'] +
                                      '-power从' + str(power) + '更新至' +
                                      str(powerNew) + '<<<<<')
                         mysql.update(sql_update,
                                      (str(powerNew), cur_time, item['ip']))
                     else:
                         Loggers.Info(u'>>>>>删除ip:' + item['ip'] + '<<<<<')
                         # BUG FIX: (x,) is a 1-tuple; (x) was the bare str.
                         mysql.delete(sql_delete, (item['ip'],))
                     mysql.end()
         except BaseException as e:
             Loggers.Error('>>>>> check_db_ip ' + u'出错' + str(e) + '<<<<<')
Example #3
0
 def check_all_thread(self, funcs):
     """Hourly watchdog over the given worker threads.

     funcs: iterable of threading.Thread objects to monitor.

     NOTE(review): calling start() on a Thread that has already run
     raises RuntimeError -- Python threads cannot be restarted. A real
     restart needs a freshly constructed Thread (thread-factory
     refactor); the call is kept as in the original, and the outer
     except will log the failure.
     """
     # FIX: create the logger once, not on every hourly iteration.
     Loggers = Logger(special_log_file='checkAllThread')
     while True:
         try:
             Loggers.Info(u'>>>>> 开始检查所有线程 <<<<<')
             for fun in funcs:
                 Loggers.Info(u'>>>>> 检查' + str(fun.getName()) + '线程 <<<<<')
                 if not fun.isAlive():
                     Loggers.Info(u'>>>>> ' + str(fun.getName()) +
                                  '线程停止,重新启动 <<<<<')
                     fun.start()
                 else:
                     Loggers.Info(u'>>>>> ' + str(fun.getName()) +
                                  '线程运行中 <<<<<')
         except BaseException as e:
             Loggers.Error('>>>>> check_all_thread ' + u'出错' + str(e) +
                           ' <<<<<')
         # FIX: sleep outside the try -- the original skipped the sleep
         # when an exception fired, spinning in a tight error loop.
         time.sleep(60 * 60)
Example #4
0
    def get_ip_from_xici(self):
        """Scrape proxy ips from xicidaili.com forever.

        Every ~10s: fetch the list page, extract ip:port pairs from the
        #ip_list table cells, check each against AJK, and insert the
        working ones into the DB via insert_data.
        """
        Loggers = Logger(special_log_file='getProxyXiCi')
        # Hoisted loop invariants: patterns compiled once.
        ip_re = re.compile(r'^(?:[0-9]{1,3}\.){3}[0-9]{1,3}$')
        port_re = re.compile(
            r'^([0-9]|[1-9]\d{1,3}|[1-5]\d{4}|6[0-4]\d{4}|65[0-4]\d{2}|655[0-2]\d|6553[0-5])$')
        while True:
            try:
                avalibleIpsOneWeb = []
                startGetIpTime = time.time()
                startGetIpTimeFormat = time.strftime(
                    "%Y-%m-%d %H:%M:%S", time.localtime(time.time()))
                title = u'西祠代理'
                Loggers.Info('>>>>> ' + startGetIpTimeFormat + '|' + title +
                             u'|开始抓取ip <<<<<')
                url = 'http://www.xicidaili.com/nn/'
                # BUG FIX: copy -- assigning self.headers directly let the
                # user-agent write below mutate the shared instance dict.
                head = dict(self.headers)
                head['user-agent'] = random.choice(self.user_agents)
                try:
                    Loggers.Info('>>>>> ' + title + u'|开始请求url ' + url +
                                 ' <<<<<')
                    r = requests.get(url, timeout=10, headers=head)
                    soup = BeautifulSoup(r.text, "html.parser")
                    # renamed from `list` -- was shadowing the builtin
                    cells = soup.find('table', attrs={
                        'id': 'ip_list'
                    }).find_all('td')
                    strText = ''
                    ips = []
                    # Cells alternate: an ip cell is remembered, then the
                    # next port cell completes the "ip:port" pair.
                    for cell in cells:
                        content = cell.get_text().strip()
                        if ip_re.match(content):
                            strText = content
                        if port_re.match(content):
                            strText = strText + ':' + content
                            ips.append(strText)
                    endGetIpTime = time.time()
                    endGetIpTimeFormat = time.strftime(
                        "%Y-%m-%d %H:%M:%S", time.localtime(time.time()))
                    Loggers.Info('>>>>> ' + endGetIpTimeFormat + '|' + title +
                                 u'|结束抓取ip,共抓取' + str(len(ips)) + '条 <<<<<')
                    Loggers.Info('>>>>> ' + endGetIpTimeFormat + '|' + title +
                                 u'|开始检查ip是否可用,抓取共耗时' +
                                 str(endGetIpTime - startGetIpTime) + ' <<<<<')

                    for ip in ips:
                        Loggers.Info(u'>>>>> 开始检查ip:' + str(ip) + ' <<<<<')
                        start = time.time()
                        if self.utils.checkIpForAJK(ip):
                            end = time.time()
                            avalibleIpsOneWeb.append({
                                'source': 'xici',
                                'ip': ip,
                                'time': str(end - start)
                            })
                            Loggers.Info('>>>>> ip:' + str(ip) + u' 可用<<<<<')
                        else:
                            Loggers.Info('>>>>> ip:' + str(ip) + u' 不可用<<<<<')
                    endCheckIpTime = time.time()
                    endCheckIpTimeFormat = time.strftime(
                        "%Y-%m-%d %H:%M:%S", time.localtime(time.time()))
                    Loggers.Info('>>>>> ' + endCheckIpTimeFormat + '|' +
                                 title + u'|结束检查ip是否可用,检查共耗时' +
                                 str(endCheckIpTime - endGetIpTime) + ' <<<<<')
                    Loggers.Info('>>>>> ' + title + u'|成功率:' +
                                 str(len(avalibleIpsOneWeb)) + '-' +
                                 str(len(ips)) + ' <<<<<')
                    Loggers.Info('>>>>> ' + endCheckIpTimeFormat + '|' +
                                 title + u'|结束,抓取到' +
                                 str(len(avalibleIpsOneWeb)) + u'条可用ip,共耗时' +
                                 str(endCheckIpTime - startGetIpTime) +
                                 ' <<<<<')
                    self.insert_data(Loggers, avalibleIpsOneWeb)
                except BaseException as e:
                    Loggers.Error(u'>>>>> 请求url出错 ' + str(e) + '<<<<<')
            except BaseException as e:
                Loggers.Error(u'>>>>> 抓取ip循环出错 ' + str(e) + '<<<<<')
            time.sleep(10)
Example #5
0
 def get_ip_from_66ip(self):
     """Scrape proxy ips from 66ip.cn forever (same flow as the xici job).

     Every ~10s: fetch the export page, regex out (ip, port) pairs,
     check each against AJK, and insert the working ones into the DB.
     """
     Loggers = Logger(special_log_file='getProxy66Ip')
     # Hoisted loop invariant: the (ip, port) pattern compiled once.
     ip_port_re = re.compile(
         r'(?:((?:\d|[1-9]\d|1\d{2}|2[0-5][0-5])\.(?:\d|[1-9]\d|1\d{2}|2[0-5][0-5])\.(?:\d|[1-9]\d|1\d{2}|2[0-5][0-5])\.(?:\d|[1-9]\d|1\d{2}|2[0-5][0-5]))\D+?(6[0-5]{2}[0-3][0-5]|[1-5]\d{4}|[1-9]\d{1,3}|[0-9]))')
     while True:
         try:
             avalibleIpsOneWeb = []
             startGetIpTime = time.time()
             title = u'安小莫'
             startGetIpTimeFormat = time.strftime(
                 "%Y-%m-%d %H:%M:%S", time.localtime(time.time()))
             Loggers.Info('>>>>> ' + startGetIpTimeFormat + '|' + title +
                          u'|开始抓取ip <<<<<')
             url = 'http://www.66ip.cn/mo.php?sxb=&tqsl=100&port=&export=&ktip=&sxa=&submit=%CC%E1++%C8%A1&textarea=http%3A%2F%2Fwww.66ip.cn%2F%3Fsxb%3D%26tqsl%3D100%26ports%255B%255D2%3D%26ktip%3D%26sxa%3D%26radio%3Dradio%26submit%3D%25CC%25E1%2B%2B%25C8%25A1'
             # BUG FIX: copy -- assigning self.headers directly let the
             # user-agent write below mutate the shared instance dict.
             head = dict(self.headers)
             head['user-agent'] = random.choice(self.user_agents)
             try:
                 Loggers.Info('>>>>> ' + title + u'|开始请求url ' + url +
                              ' <<<<<')
                 r = requests.get(url, timeout=10, headers=head)
                 r.encoding = 'gb2312'  # page is GB2312-encoded
                 ips = []
                 for item in ip_port_re.findall(r.text):
                     ips.append(item[0] + ':' + item[1])
                 endGetIpTime = time.time()
                 endGetIpTimeFormat = time.strftime(
                     "%Y-%m-%d %H:%M:%S", time.localtime(time.time()))
                 Loggers.Info('>>>>> ' + endGetIpTimeFormat + '|' + title +
                              u'|结束抓取ip,共抓取' + str(len(ips)) + '条 <<<<<')
                 Loggers.Info('>>>>> ' + endGetIpTimeFormat + '|' + title +
                              u'|开始检查ip是否可用,抓取共耗时' +
                              str(endGetIpTime - startGetIpTime) + ' <<<<<')
                 for ip in ips:
                     start = time.time()
                     Loggers.Info(u'>>>>> 开始检查ip:' + str(ip) + ' <<<<<')
                     if self.utils.checkIpForAJK(ip):
                         end = time.time()
                         avalibleIpsOneWeb.append({
                             'source': '66ip',
                             'ip': ip,
                             'time': str(end - start)
                         })
                         Loggers.Info('>>>>> ip:' + str(ip) + u' 可用<<<<<')
                     else:
                         Loggers.Info('>>>>> ip:' + str(ip) + u' 不可用<<<<<')
                 endCheckIpTime = time.time()
                 endCheckIpTimeFormat = time.strftime(
                     "%Y-%m-%d %H:%M:%S", time.localtime(time.time()))
                 Loggers.Info('>>>>> ' + endCheckIpTimeFormat + '|' +
                              title + u'|结束检查ip是否可用,检查共耗时' +
                              str(endCheckIpTime - endGetIpTime) + ' <<<<<')
                 Loggers.Info('>>>>> ' + title + u'|成功率:' +
                              str(len(avalibleIpsOneWeb)) + '-' +
                              str(len(ips)) + ' <<<<<')
                 Loggers.Info('>>>>> ' + endCheckIpTimeFormat + '|' +
                              title + u'|结束,抓取到' +
                              str(len(avalibleIpsOneWeb)) + u'条可用ip,共耗时' +
                              str(endCheckIpTime - startGetIpTime) +
                              ' <<<<<')
                 self.insert_data(Loggers, avalibleIpsOneWeb)
             except BaseException as e:
                 Loggers.Error(u'>>>>> 请求url出错 ' + str(e) + '<<<<<')
         except BaseException as e:
             Loggers.Error(u'>>>>> 抓取ip循环出错 ' + str(e) + '<<<<<')
         time.sleep(10)
Example #6
0
class ajkLoadDataAndInsert():
    """Crawls AJK (anjuke) second-hand-house listing pages through
    DB-stored proxy ips and inserts/updates the scraped records."""
    # NOTE(review): these are class attributes shared by all instances;
    # several are rebound per-instance in __init__.
    city_list = []    # cities to crawl; presumably populated elsewhere -- TODO confirm
    user_agents = []  # rebound in __init__ to Headers().user_agents
    headers = {}      # rebound in __init__ to Headers().headers
    utils = Utils()   # shared helper (config loading, ip checks, parsing)
    list_data = []
    ips = []          # proxy pool; ips[0] is used as the first active proxy
    ipIndex = 0
    ip = {}           # currently-active proxy dict, keyed by 'ip'
    PROXYNAME = 'ipProxy'          # proxy table name in the DB
    COLUMENAME = 'active_ajk_sec'  # target table for scraped listings -- TODO confirm

    def __init__(self):
        """Set up logging, request headers, config, and the shared DB pool."""
        self.Logger = Logger('getAjkData')
        # FIX: instantiate Headers once instead of twice.
        header_source = Headers()
        self.user_agents = header_source.user_agents
        self.headers = header_source.headers
        self.cfg = self.utils.pathToConfig()  # utils is a class attribute
        self.mysql = Mysql(self.cfg.get('DB', 'DBHOST'),
                           int(self.cfg.get('DB', 'DBPORT')),
                           self.cfg.get('DB', 'DBUSER'),
                           self.cfg.get('DB', 'DBPWD'), 3, 5)

    def load_detail_info_sec(self):
        self.Logger.Info(u'>>>>> 开始抓取详细数据 <<<<<')
        self.ip = self.ips[0]
        for city in self.city_list:
            for page in range(0, int(city['ajk_sec_pages'])):
                city_list_url = city['ajk_sec_url'].replace(
                    '?from=navigation',
                    'p' + str(int(page) + 1) + '/#filtersort')
                self.Logger.Info(u'>>>>> 开始抓取:' + city['city_name'] + '|url:' +
                                 str(city_list_url) + '|ip:' + self.ip['ip'] +
                                 '<<<<<')
                oneCityGetDown = True
                while oneCityGetDown:
                    try:
                        self.Logger.Info(u'>>>>> 使用ip:' + str(self.ip['ip']) +
                                         '<<<<<')
                        proxies = {
                            'http': self.ip['ip'],
                            'https': self.ip['ip']
                        }
                        head = self.headers
                        head['user-agent'] = random.choice(self.user_agents)
                        r = requests.get(city_list_url,
                                         timeout=10,
                                         proxies=proxies,
                                         headers=head)
                        time.sleep(random.random() * 10)
                        soup = BeautifulSoup(r.text, "html.parser")
                        title = soup.find('title').get_text()
                        if '二手房' in title:
                            self.Logger.Info(u'>>>>> ip:' +
                                             str(self.ip['ip']) + u'可用|' +
                                             title + '<<<<<')
                            list = soup.find(attrs={
                                'id': 'houselist-mod-new'
                            }).find_all('li')
                            for l in list[0:]:
                                oneDetailGetDown = True
                                while oneDetailGetDown:
                                    house_title = l.find(attrs={
                                        'class': 'house-title'
                                    }).find('a').attrs['title'].strip()
                                    price = l.find(attrs={
                                        'class': 'price-det'
                                    }).get_text().strip()
                                    try:
                                        detail_url = l.find(
                                            attrs={
                                                'class': 'house-title'
                                            }).find('a').attrs['href']
                                        self.Logger.Info(
                                            u'>>>>> 开始抓取:' + house_title +
                                            '|' + detail_url.split('view/')
                                            [1].split('?')[0] + '|ip:' +
                                            self.ip['ip'] + u'|数据<<<<<')
                                        proxies = {
                                            'http': self.ip['ip'],
                                            'https': self.ip['ip']
                                        }
                                        head['user-agent'] = random.choice(
                                            self.user_agents)
                                        r_detail = requests.get(
                                            detail_url.split('now_time')[0],
                                            timeout=10,
                                            proxies=proxies,
                                            headers=head)
                                        time.sleep(random.random() * 20)
                                        soup_detail = BeautifulSoup(
                                            r_detail.text, "html.parser")
                                        title_detail = soup_detail.find(
                                            'title').get_text()
                                        if '58安居客' in title_detail and '访问验证' not in title_detail:
                                            try:
                                                self.Logger.Info(
                                                    u'>>>>> 开始从列表页获取详情中需要的数据|'
                                                    + title_detail + '<<<<<')
                                                detail_dict = self.get_data(
                                                    soup_detail)
                                                detail_dict['city_id'] = city[
                                                    'city_id']
                                                detail_dict[
                                                    'city_name'] = city[
                                                        'city_name']
                                                detail_dict['source'] = 'ajk'
                                                detail_dict[
                                                    'house_id'] = detail_url.split(
                                                        'view/')[1].split(
                                                            '?')[0]
                                                detail_dict[
                                                    'link_url'] = detail_url.split(
                                                        '?')[0]
                                                detail_dict[
                                                    'title'] = house_title
                                                detail_dict[
                                                    'price'] = self.utils.str_to_num(
                                                        price)
                                                oneDetailGetDown = False
                                                self.insert_update_data(
                                                    detail_dict)
                                            except BaseException, e:
                                                self.Logger.Info(
                                                    u'>>>>> 从列表页获取详情中需要的数据出错' +
                                                    str(e) + '<<<<<')
                                        elif '可能被删除' in title_detail:
                                            self.Logger.Info(u'>>>>> 该链接失效|' +
                                                             title_detail +
                                                             '<<<<<')
                                            oneDetailGetDown = False
                                        else:
                                            self.Logger.Info(
                                                u'>>>>> ip for detail:' +
                                                str(self.ip['ip']) + u'不可用|' +
                                                str(title_detail) + '<<<<<')
                                            result_ip = self.utils.get_active_ip(
                                                self.ips, self.ip, self.Logger,
                                                self.PROXYNAME, self.mysql)
                                            self.ip = result_ip['active_ip']
                                            self.ips = result_ip['ips']
                                    except BaseException, e:
                                        self.Logger.Info(
                                            u'>>>>> ip for detail:' +
                                            str(self.ip['ip']) + u'不可用,超时|' +
                                            str(e) + '<<<<<')
                                        result_ip = self.utils.get_active_ip(
                                            self.ips, self.ip, self.Logger,
                                            self.PROXYNAME, self.mysql)
                                        self.ip = result_ip['active_ip']
                                        self.ips = result_ip['ips']
                            oneCityGetDown = False
                            self.Logger.Info(u'>>>>> ========== city:' +
                                             city['city_name'] + u'第' +
                                             str(int(page) + 1) + u'页' +
                                             u'抓取完成 ========== <<<<<')
                        else: