Example #1
0
 def __init__(self, site):
     '''
     Open a Redis connection and derive this site's two key names.

     :param site: prefix placed in front of '_new_urls' and '_old_urls'
     '''
     self.conn = redisPool.getRedis()  # Redis connection obtained from the pool helper
     # Both key names share the site prefix and differ only in their suffix.
     self.new_url, self.old_url = site + '_new_urls', site + '_old_urls'
Example #2
0
 def __init__(self):
     '''
     Open a Redis connection and set the proxy placeholder and Redis set names.
     '''
     self.connR = getRedis()  # Redis connection
     # Placeholder proxy mapping; to be replaced with real proxy addresses
     # once proxy IPs are available.
     self.proxies = dict(http='http://', https='https://')
     # Set holding the crawled IPs, and set holding the usable ones.
     self.ips, self.ip_able = 'ip_new', 'ip_able'
def able_ip():
    '''
    Pop candidate proxies from the Redis set ``ip_able`` until one is verified.

    A candidate ``host:port`` passes when a request to https://ip.cn/ routed
    through it returns HTTP 200 and the first dotted-quad address found in the
    page script equals the candidate's host part.

    NOTE: candidates are taken with SPOP, so every tested proxy -- including
    the one that passes -- is removed from ``ip_able``.

    :return: the verified proxy IP (host part only, no port), or None when
             ``ip_able`` runs out before any candidate passes
    '''
    connR = getRedis()
    url = 'https://ip.cn/'
    while True:
        proxy = connR.spop('ip_able')
        if proxy is None:
            # SPOP yields None on an empty set; stop here instead of failing
            # on None.split().
            return None
        if isinstance(proxy, bytes):
            # redis-py returns bytes unless the client was created with
            # decode_responses=True; the string concatenation below needs str.
            proxy = proxy.decode('utf-8')
        # NOTE(review): control() is defined outside this block -- presumably
        # request throttling; confirm.
        control()
        go_ip = proxy.split(':', 1)[0]
        proxy_test = {'http': 'http://' + proxy, 'https': 'https://' + proxy}

        try:
            res = requests.get(url, headers=get_agent(), proxies=proxy_test, timeout=2)
        except requests.RequestException:
            # A dead or slow proxy must not abort the search; try the next one.
            continue
        if res.status_code != 200:
            continue

        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'html.parser')
        container = soup.find('div', 'container-fluid')
        script_tag = container.find_next('script') if container is not None else None
        script = script_tag.string if script_tag is not None else None
        # Dots are escaped so only dotted-quad text matches; an unescaped '.'
        # would match any character.
        match = re.search(r'\d+\.\d+\.\d+\.\d+', script or '')
        if match is None:
            # Page layout not as expected through this proxy; treat as a failure.
            continue
        ip = match.group(0)
        print('爬取到的ip值是:', ip, '当前代理ip是', go_ip)
        if ip == go_ip:
            print('########测试通过,当前ip是', go_ip)
            return ip

# ip = able_ip()
# print(ip)
Example #4
0
#encoding=utf-8
from django.test import TestCase

# Create your tests here.
from db import DataClass

# Ad-hoc exploration script: every statement below runs at import time and
# talks to the project's data-store wrappers.
# NOTE(review): db_kanzhun is not used by any live statement in this block;
# the query below builds its own DataClass('kanzhunDB') instead.
db_kanzhun = DataClass.DataClass('kanzhunDB')
# Used below as the key into connM1 (presumably a collection name -- confirm).
company = '荔枝微课'

from db.redisPool import getRedis
# Redis client; only referenced by the disabled code right below.
db = getRedis()
# Disabled: dump the members of the 'boss_old_urls' Redis set.
# tags = db.smembers('boss_old_urls')
# print(type(tags),tags)
# for i in tags:
#     print(i)

# NOTE(review): db_boss is not used by any live statement in this block.
db_boss = DataClass.DataClass('bossDB', 'bossDB')
# Despite its name, data_boss is read from 'kanzhunDB', not 'bossDB'.
# The call looks like a pymongo-style find(filter, projection) -- TODO confirm.
# NOTE(review): the filter matches the literal string 'company', not the
# `company` variable defined above -- confirm this is intended.
data_boss = list(
    DataClass.DataClass('kanzhunDB').connM1[company].find(
        {'title': 'company'}, {'interviewDegree': 1}))
# Disabled: earlier experiments that counted and annotated the result rows.
# print(data_boss.count())
# for i in data_boss:
#     # print(i['content'][10])
#     print('***********************************')
#     i['match'] = 1.1
#     print(i)
# li = list(data_boss)
# Print the whole result list, then each row on its own line.
print('列表:&&&&&&&&&&&&&7', data_boss)
for i in data_boss:
    print(i)
# Disabled: list the collection names of the kanzhun store.
# dblist = db_kanzhun.connM1.collection_names()
Example #5
0
#coding=utf-8
'''
*************************
file:       AnalysisJobs IPValidate
author:     gongyi
date:       2019/7/19 11:41
****************************
change activity:
            2019/7/19 11:41
'''
#检验ip是否可用
import telnetlib
from db.redisPool import getRedis
from job.spider.spiderHelper import get_agent

connR = getRedis()


def get_ip():
    '''
    Pop one candidate IP from the Redis set ``ip_new``.

    When fewer than 10 candidates are pending, ``pro.start()`` is called
    before popping (per the original comment, this launches the crawler
    asynchronously to collect more candidates).

    :return: the member removed from ``ip_new`` by SPOP; None when the set
             is empty
    '''
    count = connR.scard('ip_new')
    print('当前有{}个待测ip'.format(count))
    # Reuse the count just read: the logged number and the refill decision
    # then always agree, and a second SCARD round trip is avoided.
    if count < 10:
        print('待测ip数量不足,开始爬取待测ip')
        # NOTE(review): `pro` is not defined in the visible part of this file;
        # if it is a Thread/Process object, start() can only be called once --
        # confirm how repeated refills are meant to work.
        pro.start()
    return connR.spop('ip_new')