Example #1
def retry_crawl(url, is_proxy):
    response = None
    logging.error('Crawl error! Trying to re-fetch page {}'.format(url))
    for i in range(spider_retry_num):
        logging.error('Retry attempt {}'.format(i + 1))
        try:
            if is_proxy:
                proxy = _proxy()
                print('Using proxy {} to fetch page {}'.format(proxy, url))
                response = requests.get(url, headers=get_proxy_headers(proxy), proxies=proxy, timeout=spider_timeout)
            else:
                response = requests.get(url, headers=get_headers(), timeout=spider_timeout)
        except (requests.exceptions.ProxyError, requests.exceptions.ConnectTimeout) as e:
            # logging.exception(e)
            logging.error(e)
            continue
        soup = BeautifulSoup(response.text, 'lxml')
        com_all_info = soup.find_all(class_='m_srchList')
        _response = response.text
        if len(com_all_info) > 0:
            break
        # elif '<script>window.location.href=' in _response:  # verification link triggered by frequent requests
        #     verify_url = re.findall("<script>window.location.href='(.*?)';</script>", _response)[0]
        #     print('Qichacha flagged this crawler for frequent requests; open this link manually to verify: {}'.format(verify_url))
        #     # verify(verify_url)
        #     time.sleep(20)
        else:
            logging.error('================= Unexpected response =================')
            logging.error(response.text)
        time.sleep(random.randint(crawl_interval_mintime, crawl_interval_maxtime))
    return response
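The crawler snippets on this page rely on a project-local get_headers() helper that is never shown. A minimal sketch, assuming it simply wraps a randomly chosen User-Agent (the USER_AGENTS list and the get_proxy_headers() behaviour below are illustrative guesses, not the original code):

import random

# Illustrative pool of browser User-Agent strings; the real list is not shown.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.1 Safari/605.1.15',
    'Mozilla/5.0 (X11; Linux x86_64; rv:95.0) Gecko/20100101 Firefox/95.0',
]

def get_headers():
    # Request headers with a randomly chosen User-Agent, so consecutive
    # requests look less uniform to the target site.
    return {'User-Agent': random.choice(USER_AGENTS)}

def get_proxy_headers(proxy):
    # Assumed variant used together with a proxy in Example #1; here it just
    # reuses get_headers(), since the original proxy-specific logic is unknown.
    return get_headers()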
Example #2
    def __init__(self, name=None, mode='rb', fileobj=None):
        if mode != 'rb':
            raise NotImplementedError("currently the only supported mode is 'rb'")
        self._fileobj = fileobj or io.open(name, mode)
        # get_headers() here parses the file's header records, returning their
        # byte range and the headers themselves.
        self._header_range, self._headers = get_headers(self._fileobj)
        # We only own (and should later close) the file object if we opened it ourselves.
        self._owns_fd = fileobj is None
Example #3
    def __init__(self, format_url, num_articles, start=1, step=1):

        logging.basicConfig(filename='log',
                filemode='w',
                level=logging.INFO)
        self.logger = logging.getLogger(__name__)

        # URL with {} in place of page number parameter.
        # e.g. ...&page=3 becomes ...&page={}
        self._format_url = format_url

        # What page to start on
        self._start = start

        # How much to step the url page parameter by on each request
        # Usually this can be set to 1
        self._step = step

        # The number of articles to download before stopping
        self._num_articles = num_articles

        # Get headers with a randomly chosen user-agent.
        self.headers = get_headers()

        # Set initial referer to homepage of the scraped site
        self.headers['Referer'] = 'http://www.{}'.format(
                urlparse.urlparse(format_url).hostname)
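The class name is not part of the snippet above; a hypothetical usage sketch, where the name ArticleScraper and the example URL are invented from the __init__ signature alone:

# Hypothetical usage; only __init__ is shown above, so the class name is made up.
scraper = ArticleScraper(
    'http://www.example.com/news?page={}',  # {} is filled in with the page number
    num_articles=100,                       # stop after 100 articles
    start=1,                                # first value of the page parameter
    step=1,                                 # increment the parameter by 1 each time
)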
Example #4
def get_images_set_and_save_ctime(uid: int) -> set:
    images_set = set()
    images_doc_list_api = "https://api.vc.bilibili.com/link_draw/v1/doc/doc_list"
    headers = get_headers()
    # proxies = get_success_proxies()

    if os.path.exists(f"{uid}.ctime"):
        with open(f"{uid}.ctime", 'r') as f:
            ctime = f.read()
    else:
        ctime = 0
    ctime = int(ctime) if ctime else 0

    # The first page is handled separately (it carries the newest ctime)

    current_page_num = 0
    while True:
        params = {
            "uid": uid,
            "page_num": current_page_num,
            "page_size": 30,
            "biz": "all",
        }
        html = requests.get(images_doc_list_api, headers=headers, params=params)
        html.encoding = 'utf-8'
        if html.status_code != 200:
            print("获取图片列表状态码异常!", html.status_code)
            return None
        result = json.loads(html.text)
        items = result['data']['items']
        # An empty page means there is nothing more to fetch
        if not items:
            return images_set
        if current_page_num == 0:
            new_ctime = items[0]['ctime']
            new_ctime = int(new_ctime)
            if new_ctime > ctime:
                with open(f"{uid}.ctime", 'w') as f:
                    f.write(str(new_ctime))
            else:
                print("已经存在过去的图片,无需获取图片集!")
                return None
        current_page_num = current_page_num + 1


        for item in items:
            # First decide whether this item needs to be saved
            current_ctime = int(item['ctime'])
            if current_ctime < ctime:
                # No need to save
                continue
            pictures = item['pictures']
            for picture in pictures:
                images_set.add(picture['img_src'])
    return images_set
Example #5
def download_one(url: str, uid: int):
    headers = get_headers()
    result = re.search(r"(\w+)\.(bmp|jpg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|image)", url)
    if not result:
        print("Failed to parse the image name and format from the URL!", url)
        return None
    file_name = result.group(1)
    ext = result.group(2)
    file_name = f"{file_name}.{ext}"
    html = requests.get(url, headers=headers)
    if html.status_code != 200:
        print("Image download failed!", html.status_code)
        return None
    if not os.path.exists(f"images/{uid}/"):
        os.makedirs(f"images/{uid}/")
    with open(f"images/{uid}/{file_name}", 'wb') as f:
        f.write(html.content)
    print("Image downloaded:", url)
Example #6
from headers import get_headers
## This is needed to connect to the NBA stats endpoints correctly (hot fix until they push changes)
headers = get_headers()

# Make API Call for players across all seasons for a particular franchise
from nba_api.stats.endpoints import franchiseplayers
## Get all players:
nba2019_2020players = {}
for team in list_teams:
    fp = franchiseplayers.FranchisePlayers(team_id=team_to_id[team], headers=headers)
    players = fp.get_data_frames()[0]
    active_players = players.loc[players['ACTIVE_WITH_TEAM'] == 1]
    ids_per_team = active_players['PERSON_ID'].values
    player_names = active_players['PLAYER'].values
    players_per_team = dict(zip(ids_per_team, player_names))
    nba2019_2020players.update(players_per_team)
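# list_teams and team_to_id above are defined elsewhere in the original script.
# One possible way to build them (an assumption, not the original code) is from
# nba_api's bundled static team data:
#     from nba_api.stats.static import teams
#     all_teams = teams.get_teams()  # list of dicts with 'id', 'full_name', ...
#     list_teams = [t['full_name'] for t in all_teams]
#     team_to_id = {t['full_name']: t['id'] for t in all_teams}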


## Determine whether the condition holds, or return True if it cannot be evaluated (no data)
def dc(values, sign, threshold):
    if len(values) > 0:
        if sign == 'l':
            return np.mean(values) < threshold
        elif sign == "g":
            return np.mean(values) > threshold
    else:
        return True
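# Quick illustration of dc(): it compares the mean of values against threshold
# using sign 'l' (less than) or 'g' (greater than), and returns True when there
# is no data at all.
#     dc([1.5, 2.0, 1.0], 'l', 2.0)  -> True  (mean 1.5 < 2.0)
#     dc([], 'g', 5.0)               -> True  (empty input passes by default)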

def classify_players(players):
    three_and_d_players = []
    facilitators = []
Example #7
import requests, time, random
from headers import get_headers
import data_selector

headers_getter = get_headers()
proxy_pool = ["23.83.243.75:8989", "172.93.33.29:8989", "138.128.199.17:8989"]


# Pick a random proxy from the proxy pool
def random_proxy():
    index = random.randint(0, len(proxy_pool) - 1)
    return {"http": proxy_pool[index], "https": proxy_pool[index]}


# Random delay between requests
def delay():
    delay_time = random.randint(7, 25)
    #print(delay_time)
    time.sleep(delay_time)


# Pre-browse the site to obtain request headers for its anti-scraping checks
def pre_requests():
    delay()
    session = requests.session()
    session.headers.update(headers_getter.get_headers())
    r = session.get("https://angel.co/companies", proxies=random_proxy())
    csrf_token = data_selector.csrf_token(r.text)
    session.headers.update({
        "referer": "https://angel.co/companies",
        "csrf-token": csrf_token[0],
Example #8
                error_data_list = []
                i += 1
                start_url = base_url + str(name)
                # print(start_url)
                try:
                    print("正在抓取第{}个公司==========================={}".format(i, name))
                    if is_proxy:
                        proxy = _proxy()
                        print('Using proxy {} to fetch page {}'.format(proxy, start_url))
                        try:
                            response = requests.get(start_url, headers=get_proxy_headers(proxy), proxies=proxy, timeout=spider_timeout)
                        except Exception as e:
                            response = retry_crawl(start_url, is_proxy)
                    else:
                        try:
                            response = requests.get(start_url, headers=get_headers(), timeout=spider_timeout)
                        except Exception as e:
                            response = retry_crawl(start_url, is_proxy)
                    # if response.status_code != 200:
                    #     error_data_list.append(name)
                    #     print("抓取页面 {},异常 {} 可能被企查查网站反爬拦截了!".format(start_url, response.status_code))
                    #     continue

                    # Get the URL of the company's detail page
                    if response is not None:
                        search_url = get_detail_url(start_url, response, is_proxy)
                        if search_url is None:
                            # print('Too many requests to Qichacha; blocked by its anti-crawler measures, wait a while and retry!')
                            # error_data_list.append(name)
                            # break
                            raise RuntimeError('Too many requests to Qichacha; blocked by its anti-crawler measures, wait a while and retry!')