def get_proxy(self, proxy_type):
    ret = {}
    if 'http' in proxy_type:
        proxy = get_proxy()
        if proxy:
            ret['http'] = "http://" + proxy
    if 'https' in proxy_type:
        proxy = get_proxy()
        if proxy:
            ret['https'] = "https://" + proxy
    return ret
def get_result(keyword, page):
    data = {
        'wd': keyword,
        'rn': 50,
        'pn': (page - 1) * 50
    }
    proxy = get_proxy()
    proxies = {
        'https': 'https://' + proxy,
        'http': 'http://' + proxy
    }
    url = 'https://www.baidu.com/s?' + urlencode(data)
    print(url)
    response = requests.get(url, headers=headers, proxies=proxies)
    if response.status_code == 200:
        html = response.text
        doc = pq(html)
        results = doc('.result.c-container').items()
        for result in results:
            title = result.find('h3.t').text()
            href = result.find('h3.t a').attr('href')
            abstract = result.find('.c-abstract').text()
            url = result.find('.c-showurl').text().replace(' ', '')
            snapshot = result.find('.m').attr('href')
            yield {
                'title': title,
                'href': href,
                'abstract': abstract,
                'url': url,
                'snapshot': snapshot
            }
def proxy(self):
    ip = get_proxy()
    proxies = {
        'http': ip,
        'https': ip,
    }
    return proxies
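# Illustrative only: a minimal sketch (not from the source) of how a proxies mapping
# like the ones built above is typically handed to requests. The proxy address below
# is a placeholder, not a value taken from the original code.
import requests

proxies = {
    'http': 'http://127.0.0.1:8080',   # placeholder proxy address
    'https': 'http://127.0.0.1:8080',
}
response = requests.get('https://httpbin.org/ip', proxies=proxies, timeout=10)
print(response.json())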
def first_get(self):
    global _session
    _session = requests.session()
    main_url = 'https://www.instagram.com'
    try:
        _session.get(main_url, proxies=self.use_proxy, verify=True)
        self.save_cookies()
        if os.path.exists('cookiefile'):
            # print('have cookies')
            self.csrf = self.read_cookies()
            self.data = self.create_ajax()
            print(self.data)
            self.ins()
            time.sleep(5)  # wait for 5 seconds
            login_client = login(self.u_name, self.passwd)
            if login_client.do_first() is True:
                print("[*]Save account to file, congrats!")
                print(self.data)
                self.save_account_info(self.u_name, self.passwd)
            else:
                pass
    except:
        print("[x]Invalid proxy ip! Updating proxy now \n")
        self.use_proxy = proxy.get_proxy("US") if USE_PROXY else None
        pass
def get_result(keyword, page):
    data = {
        'q': keyword,
        'first': (page - 1) * 50,
    }
    proxy = get_proxy()
    proxies = {
        'https': 'https://' + proxy,
        'http': 'http://' + proxy
    }
    url = 'http://cn.bing.com/search?' + urlencode(data)
    print(url)
    response = requests.get(url, headers=headers, proxies=proxies)
    if response.status_code == 200:
        html = response.text
        doc = pq(html)
        results = doc('li.b_algo').items()
        for result in results:
            title = result.find('h2').text()
            href = result.find('h2 a').attr('href')
            abstract = result.find('.b_caption').text()
            url = result.find('cite').text().replace(' ', '')
            yield {
                'title': title,
                'href': href,
                'abstract': abstract,
                'url': url,
            }
def download(url, proxy=proxy_tool.get_proxy(), num_retries=config.NUM_RETRIES):
    """
    :param url: URL to download
    :param proxy: proxy to use (note: the default is fetched once, at definition time)
    :param num_retries: number of times to retry after a download error
    :return: page HTML, or None on failure
    """
    time.sleep(config.DELAY)
    log.info('Downloading:{}'.format(url))
    headers = {
        'User-agent': choice(ua_list.UA_LIST),
    }
    request = urllib2.Request(url, headers=headers)
    opener = urllib2.build_opener()
    if proxy:
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib2.ProxyHandler(proxy_params))
    try:
        html = urllib2.urlopen(request).read()
    except urllib2.URLError as e:
        log.error('Download error:{}'.format(e))
        if proxy:
            proxy_tool.delete_proxy(proxy)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 400 <= e.code < 600:
                # retry with a freshly fetched proxy
                return download(url, proxy=proxy_tool.get_proxy(), num_retries=num_retries - 1)
    return html
def get_page_data(url):
    p = get_proxy()
    proxy = {p['schema']: p['address']}
    useragents = open('useragent.txt').read().split('\n')
    useragent = {'User-Agent': choice(useragents)}
    print(useragent)
    print(proxy)
    url = get_json(url, useragent, proxy)
    print(url)
    items = find_element(url, 'items')
    print(items)
    sleep(0.5)
    for keys in items:
        value = find_element(keys, 'value')
        print(value)
        id = find_element(keys, 'id')
        if id is not None:
            uri = find_element(keys, 'uri_mweb')
            url = 'https://www.avito.ru' + str(uri)
            print(url)
            location = find_element(keys, 'location')
            user_type = find_element(keys, 'userType')
            try:
                number = get_number(id, useragent, proxy)
            except:
                number = ""
            if not check_number(number):
                data = {
                    'id': id,
                    'number': number,
                    'url': url,
                    'location': location,
                    'userType': user_type
                }
                write_csv(data)
        if find_element(value, 'list'):
            list = find_element(value, 'list')
            for k in list:
                id = find_element(k, 'id')
                uri = find_element(k, 'uri_mweb')
                url = 'https://www.avito.ru' + str(uri)
                location = find_element(k, 'location')
                user_type = find_element(k, 'userType')
                print(url)
                try:
                    number = get_number(id, useragent, proxy)
                except:
                    number = ""
                if not check_number(number):
                    data = {
                        'id': id,
                        'number': number,
                        'url': url,
                        'location': location,
                        'userType': user_type
                    }
                    write_csv(data)
            continue
def get_urls():
    # TODO -
    nmdproxy = proxy_lib.get_proxy()
    proxy = recompute_proxy_data(nmdproxy, 'production')
    proxy.update(recompute_proxy_data(nmdproxy, 'staging'))
    for url, app in proxy.items():
        protocol = app['protocol']
        full_url = "{protocol}://{url}".format(protocol=protocol, url=url)
        yield full_url, len(proxy)
def main():
    value_proxy = proxy.get_proxy(proxy.parse(proxy.get_html(proxy.BASE_URL)))
    print(value_proxy)
    topic_count = parse(get_html(FOREIGN_URL, proxies={'http': value_proxy}))
    print('Search results: %d (max: 50)' % len(topic_count))
    foreign_films = 'Зарубежные фильмы'  # "Foreign films"
    save_html(foreign_films, topic_count, 'project.html')

    value_proxy = proxy.get_proxy(proxy.parse(proxy.get_html(proxy.BASE_URL)))
    print(value_proxy)
    topic_count = parse(get_html(OUR_URL, proxies={'http': value_proxy}))
    print('Search results: %d (max: 50)' % len(topic_count))
    our_films = 'Наши фильмы'  # "Our films"
    save_html(our_films, topic_count, 'project.html', mode='a')  # append to the file
def switch_proxy(self):
    if self.protocol and self.addr and self.port:
        proxy.proxy_die(self.protocol, self.addr, self.port)
    protocol, addr, port = proxy.get_proxy()
    if self.protocol and self.addr and self.port:
        self.log(u"Switching proxy: %s:%s => %s:%s" % (self.addr, self.port, addr, port))
    self.protocol, self.addr, self.port = protocol, addr, port
    prxhandler = urllib2.ProxyHandler({
        self.protocol: "%s:%s" % (self.addr, self.port)
    })
    self.opener.add_handler(prxhandler)
def get_books_per_page(page_url):
    try:
        px = proxy.get_proxy()
        book_list_html = requests.get(page_url, headers=header)
        # book_list_html = requests.get(page_url, proxies=px, headers=header)
        book_list_content = BeautifulSoup(book_list_html.text, 'html.parser')
        book_list = book_list_content.find_all('div', class_='info')
        books = []
        for bl in book_list:
            books.append(bl.find('a'))
        return books
    except:
        return get_books_per_page(page_url)
def first_step(self):
    global _session
    use_proxy = proxy.get_proxy(1)  # fetch proxy IPs and ports; this returns an array of 1*10 proxies
    test = emailget.email()  # instantiate the mailbox helper
    new_email = test.get_emailaddress()  # get a fresh e-mail address
    print(new_email)  # show it; it is automatically used as the registration e-mail
    _session = requests.session()  # prepare the registration session
    self.register_new()  # start registration; as of 4.30 Zhihu still has e-mail sign-up disabled, so the test stops here
    test.get_content()  # log in to the disposable mailbox and fetch Zhihu's mail; currently only the welcome mail, to be updated once sign-up reopens
    test.vertify_email()  # find the URL in the verification mail and paste it at the prompt to finish verification
    account_info.savedata(new_email, password)  # save the new bot account's e-mail and password to a txt file; the password argument does not exist yet and the e-mail string still needs to be split with re
def process_request(self, request, spider):
    # print(time.strftime('%H:%M:%S'))
    proxy_config = get_proxy(keep_ip=False)['proxy']
    # kwargs['proxies'] = {'http': 'http://%(user)d:%(pwd)s@%(proxy)s' % proxy_config,
    # print(proxy_config)
    # print(time.strftime('%H:%M:%S'))
    # print('proxy_config: ', proxy_config)
    # request.meta['proxies'] = {'http': 'http://%(user)d:%(pwd)s@%(proxy)s' % proxy_config,
    #                            'https': 'https://%(user)d:%(pwd)s@%(proxy)s' % proxy_config}
    # Set the proxy host and port
    # request.meta['proxy'] = 'http://%s:%d/get-proxy-api' % ('118.190.114.196', 8080)
    # print('proxy_config: ', proxy_config)
    if proxy_config:
        request.meta['proxy'] = 'http://%s' % proxy_config
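# Illustrative only: a downloader middleware like the one above would normally be enabled
# in the Scrapy project's settings.py. The module path and priority below are assumptions
# for the sketch, not values from the source.
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.ProxyMiddleware': 543,
}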
def get(url,
        retry=3,
        fpfirst=False,
        use_proxy=False,
        render_js=False,
        headers=None,
        content_length_limit=None):
    """
    Download a web page via proxy and return it as a unicode string.

    A proxy server is automatically retrieved from the server and used, unless
    use_proxy is set to False, in which case the page is fetched directly.
    The proxy status is reported back to the server after each successful or
    failed use.

    Note: the JavaScript renderer is not currently supported.
    """
    schema = url.split('://')[0]
    if schema == url:
        LOG.warning('URL schema missing. Assuming HTTP.')
        url = 'http://' + url
    elif schema not in ('http', 'https'):  # 'ftp'):
        LOG.error('URL schema "%s" not supported. Returning nothing.' % schema)
        return None
    if render_js and (fpfirst or headers is not None):
        LOG.error('fpfirst and headers are not supported when render_js is specified. Ignoring.')
    for i in range(retry):
        if use_proxy:
            proxy = get_proxy()
            if not proxy:
                LOG.warning('No valid proxy to get page. Continuing.')
                continue
        else:
            wait_b4_try(i)
            proxy = ''
        if render_js:
            p = _get_page_phantomjs(url, proxy)
        else:
            p = _get_page_requests(url, proxy, fpfirst, headers, content_length_limit)
        if p:
            return p
    return None
def download_from_url(url, output_directory, filename=None, use_cache=True):
    """Download a file from a url and put it under output_directory.

    :param url: Url that gives the response.
    :type url: str

    :param output_directory: Directory to put the downloaded file.
    :type output_directory: str

    :param filename: Optional filename for the downloaded file.
    :type filename: str

    :param use_cache: If there is a cached copy of the file already in the
        output directory, do not refetch it (True) or force a refetch (False).
    :type use_cache: bool

    :returns: File path if the download succeeds, else None.
    :rtype: str
    """
    if filename is None:
        filename = get_filename(url)
    LOGGER.info('Download file %s from %s' % (filename, url))
    file_path = os.path.join(output_directory, filename)
    if os.path.exists(file_path) and use_cache:
        LOGGER.info('File %s exists, not downloading' % file_path)
        return file_path

    # Set proxy for the network manager
    proxy = get_proxy()
    network_manager = QNetworkAccessManager()
    if proxy is not None:
        network_manager.setProxy(proxy)

    # Download process
    # noinspection PyTypeChecker
    downloader = FileDownloader(network_manager, url, file_path)
    try:
        result = downloader.download()
    except IOError as ex:
        raise DownloadException(ex)

    if result[0] is not True:
        _, error_message = result
        raise DownloadException(error_message)

    if os.path.exists(file_path):
        return file_path
    else:
        return None
def get_book_info(book_url):
    try:
        px = proxy.get_proxy()
        book_html = requests.get(book_url, headers=header)
        # book_html = requests.get(book_url, proxies=px, headers=header)
        book_content = BeautifulSoup(book_html.text, 'html.parser')
        book_info = book_content.find('div', class_='subject')
        if book_info is None:
            # Usually one of the IPs in the proxy pool was banned and we happened to pick it
            logger.info('Failed to fetch this book, retrying: ' + book_url)
            return get_book_info(book_url)
        return book_info
    except:
        logger.info('Exception, retrying: ' + book_url)
        return get_book_info(book_url)
def get_all_tag_url():
    try:
        px = proxy.get_proxy()
        # tag_page_html = requests.get(home_url + '/tag/', proxies=px, headers=header)
        tag_page_html = requests.get(home_url + '/tag/', headers=header)
        tag_page_content = BeautifulSoup(tag_page_html.text, 'html.parser')
        tags = tag_page_content.find('div', class_='article').find_all('a')
        tag_urls = []
        for a in tags:
            if not a.get('href') is None:
                tag_urls.append(home_url + a.get('href'))
        del tag_urls[0]
        return tag_urls
    except:
        return get_all_tag_url()
def main():
    geturl.main()
    time.sleep(5)
    link = random_link()
    name = raw_input('Enter name: ')
    hashtag = raw_input('Enter hashtag: ')
    proxy_choice = raw_input('Do you want to use proxies? (yes/no)')
    # credentials are redacted in the source
    print('Logging in using\nUsername=' + '******' + '\nPassword=' + '******')
    if proxy_choice == 'yes':
        proxies = proxy.get_proxy()
        i = random.randint(0, len(proxies) - 1)
        pr = proxies[i]
        profile = webdriver.FirefoxProfile()
        profile.set_preference("network.proxy.type", 1)
        profile.set_preference("network.proxy.http", pr['ip'])
        profile.set_preference("network.proxy.http_port", int(pr['port']))
        profile.set_preference("network.proxy.ssl", pr['ip'])
        profile.set_preference("network.proxy.ssl_port", int(pr['port']))
        options = setup_headless()
        # note: the proxy profile built above is never passed to webdriver.Firefox in the source
        browser = webdriver.Firefox(firefox_options=options)
        login(browser)
    else:
        options = setup_headless()
        browser = webdriver.Firefox(firefox_options=options)
        login(browser)
    while True:
        print('Visiting group URL')
        browser.get(groupURL)
        tb = browser.find_element_by_name('xhpc_message_text')
        message = get_message()
        msg = prep_message(message, link, name, hashtag)
        print('Posting')
        time.sleep(5)
        print(msg)
        tb.send_keys(msg)
        time.sleep(5)
        try:
            post_btn = browser.find_element_by_xpath(
                '/html/body/div[1]/div[3]/div[1]/div/div[2]/div[2]/div[2]/div[2]/div[2]/div[2]/div/div/div[2]/div[1]/div/div/div/div[2]/div/div[2]/div[3]/div/div[2]/div/div[2]/button'
            )
            post_btn.click()
            break
        except Exception as e:
            print(e)
    browser.close()
def find_max_page(tag_url):
    try:
        px = proxy.get_proxy()
        home_html = requests.get(tag_url, headers=header)
        # home_html = requests.get(tag_url, proxies=px, headers=header)
        page_content = BeautifulSoup(home_html.text, 'html.parser')
        # Take the second-to-last tag in the pagination div, which is the max page number
        paginator = page_content.find('div', class_='paginator').find_all('a')
        if paginator is not None:
            size = int(paginator[len(paginator) - 2].text)
        else:
            size = 0
        # Douban reports many pages, but everything after page 50 is empty
        if size > 50:
            size = 50
        return size
    except:
        return find_max_page(tag_url)
def ttscache_get_proxy():
    """Get the proxy.

    :returns: the get_proxy content.
    """
    if request.method == 'GET':
        logging.debug("GET request")
        header, body = get_proxy()
        if 'filename' in header:
            logging.debug("Send certificate file")
            return send_file(BytesIO(body),
                             attachment_filename=header.get('filename'),
                             mimetype=header.get('Content-Type'))
        else:
            logging.debug("Send response")
            response = make_response(body)
            response.headers['Content-Type'] = header.get('Content-Type')
            return response
def create_accounts():
    while True:
        logging.info("Getting proxy")
        proxy = get_proxy()
        logging.info(f"Got proxy, {proxy}")
        for _ in range(10):
            try:
                driver = get_driver(proxy)
                register_link = "https://login.aliexpress.com/"
                driver.get(register_link)
                set_location_cookie(driver)
                email, password = create_new_account(driver)
                with open("accounts.txt", "a") as myfile:
                    myfile.write(f"{email}:{password}\n")
                driver.close()
            except Exception as e:
                logging.warning(e)
                break
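# Illustrative only: get_driver() is not shown in the source. A minimal sketch of one way
# such a helper could build a Selenium Chrome driver that routes traffic through the
# fetched proxy, assuming `proxy` is a "host:port" string.
from selenium import webdriver

def get_driver(proxy):
    options = webdriver.ChromeOptions()
    if proxy:
        # route all browser traffic through the proxy
        options.add_argument('--proxy-server=http://%s' % proxy)
    options.add_argument('--headless')
    return webdriver.Chrome(options=options)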
def load_data(url, data, headers):
    while True:
        # Get a proxy from the pool; if it fails 5 times in a row, delete it from the pool
        pro = proxy.get_proxy()
        proxies = {"http": "http://{}".format(pro)}
        print(proxies["http"])
        retry_count = 5
        while retry_count > 0:
            try:
                t = random.randint(1, 5)
                time.sleep(t)
                # Send the request
                response = requests.post(url, data=data, headers=headers, proxies=proxies)
                text = response.text
                result = json.loads(text)
                return result
            except Exception:
                retry_count -= 1
        proxy.delete_proxy(pro)
def get(self):
    proxy = Redis().get()
    while not proxy:
        get_proxy()
        proxy = Redis().get()
    proxy_request = {'http': proxy}
    try:
        response = requests.get(self.url, headers=self.head, proxies=proxy_request, timeout=self.timeout)
        if response.status_code == 200:
            return response
        elif response.status_code == 404:
            return 'The requested page does not exist or cannot be fetched'
        else:
            print('Request failed, switching proxy and retrying')
            # Redis().remove(proxy)
            get_proxy()
            return self.get()
    except:
        print('Request failed, switching proxy and retrying')
        # Redis().remove(proxy)
        get_proxy()
        return self.get()
    'Accept-Encoding': 'gzip, deflate, sdch, br',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'Host': 'www.zhihu.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36'
}

use_proxy = proxy.get_proxy(1)  # fetch proxy IPs and ports; returns an array of 1*10 proxies, please be patient
proxy_dic = {}
for i in range(1):
    proxy_dic["http"] = 'http://' + use_proxy[0][i] + ':' + use_proxy[1][i]  # build a proxy mapping that requests can use directly


class Register():
    _session = None

    def __init__(self):
        self.first_step()

    def first_step(self):
        global _session
        use_proxy = proxy.get_proxy(1)  # fetch proxy IPs and ports; returns an array of 1*10 proxies
def check_remove_proxy_ip(self, proxy_name, ipstr):
    from proxy import get_proxy
    consumer = get_proxy(proxy_name)
    if not consumer.valid_proxy(ipstr):
        consumer.remove_proxy(ipstr)
        return "removed"
def download_file(ddir, url, name=None, **kwargs):
    temp_name = str(uuid.uuid4()) + '.temp'
    tries = 10
    makedirs(ddir, exist_ok=True)
    for i in range(tries):
        try:
            r = requests.get(url, stream=True, proxies=get_proxy(), **kwargs)
            r.raw.read = functools.partial(r.raw.read, decode_content=True)
            r.raise_for_status()  # Should retry on connection error
            with open(join(ddir, temp_name), 'wb+') as file:
                shutil.copyfileobj(r.raw, file)

                # filename guessing
                mimetype, _ = cgi.parse_header(r.headers['content-type'])
                extension = mimetypes.guess_extension(
                    mimetype, strict=False) if r.headers.get('content-type') else None
                extension = extension or '.txt'
                filename = name or r.headers.get('x-amz-meta-original-filename')
                if filename is None:
                    filename = get_filename_from_cd(
                        r.headers.get('content-disposition')) or 'Untitled' + extension
                filename = slugify(filename)

                # ensure unique filename
                filename = uniquify(join(ddir, filename))

                # content integrity
                is_image = r.headers.get('content-type') == 'image/png' or \
                    r.headers.get('content-type') == 'image/jpeg'
                if r.headers.get('content-length') and getsize(
                        join(ddir, temp_name)) < int(r.headers.get('content-length')):
                    downloaded_size = getsize(join(ddir, temp_name))
                    reported_size = r.headers.get('content-length')
                    raise DownloaderException(
                        f'Downloaded size is less than reported; {downloaded_size} < {reported_size}'
                    )
                elif r.headers.get('content-length') is None and is_image:
                    try:
                        im = Image.open(join(ddir, temp_name))
                        im.verify()
                        im.close()
                        im = Image.open(join(ddir, temp_name))
                        im.transpose(Image.FLIP_LEFT_RIGHT)
                        im.close()
                    except:
                        raise DownloaderException('Image integrity check failed')
                file.close()
                rename(join(ddir, temp_name), join(ddir, filename))
                return filename, r
        except requests.HTTPError as e:
            raise e
        except:
            if i < tries - 1:  # i is zero indexed
                continue
            else:
                raise
        break
def __init__(self):
    self.use_proxy = proxy.get_proxy("US")
    print("==Instagram-robots-account-generate==\n[*] start")  # can be removed
def check_add_proxy_ip(self, proxy_name, ipstr):
    from proxy import get_proxy
    consumer = get_proxy(proxy_name)
    consumer.on_producer_add(ipstr)
def import_posts(key, url='https://api.fanbox.cc/post.listSupporting?limit=50'):
    conn = psycopg2.connect(host=config.database_host,
                            dbname=config.database_dbname,
                            user=config.database_user,
                            password=config.database_password,
                            cursor_factory=RealDictCursor)

    scraper_data = requests.get(
        url,
        cookies={'FANBOXSESSID': key},
        headers={'origin': 'https://fanbox.cc'},
        proxies=get_proxy()
    ).json()

    if scraper_data.get('body'):
        for post in scraper_data['body']['items']:
            parsed_post = FanboxPost(post['id'], None, post)
            if parsed_post.is_restricted:
                continue
            try:
                file_directory = f"files/fanbox/{post['user']['userId']}/{post['id']}"
                attachments_directory = f"attachments/fanbox/{post['user']['userId']}/{post['id']}"

                cursor1 = conn.cursor()
                cursor1.execute(
                    "SELECT * FROM dnp WHERE id = %s AND service = 'fanbox'",
                    (post['user']['userId'],))
                bans = cursor1.fetchall()
                if len(bans) > 0:
                    continue

                check_for_flags('fanbox', post['user']['userId'], post['id'])

                cursor2 = conn.cursor()
                cursor2.execute(
                    "SELECT * FROM booru_posts WHERE id = %s AND service = 'fanbox'",
                    (post['id'],))
                existing_posts = cursor2.fetchall()
                if len(existing_posts) > 0:
                    continue

                post_model = {
                    'id': post['id'],
                    '"user"': post['user']['userId'],
                    'service': 'fanbox',
                    'title': post['title'],
                    'content': parsed_post.body_text,
                    'embed': {},
                    'shared_file': False,
                    'added': datetime.datetime.now(),
                    'published': post['publishedDatetime'],
                    'edited': post['updatedDatetime'],
                    'file': {},
                    'attachments': []
                }

                for i in range(len(parsed_post.embeddedFiles)):
                    if i == 0:
                        filename, _ = download_file(
                            join(config.download_path, file_directory),
                            parsed_post.embeddedFiles[i],
                            cookies={'FANBOXSESSID': key},
                            headers={'origin': 'https://fanbox.cc'})
                        post_model['file']['name'] = filename
                        post_model['file']['path'] = f'/{file_directory}/{filename}'
                    else:
                        filename, _ = download_file(
                            join(config.download_path, attachments_directory),
                            parsed_post.embeddedFiles[i],
                            cookies={'FANBOXSESSID': key},
                            headers={'origin': 'https://fanbox.cc'})
                        post_model['attachments'].append({
                            'name': filename,
                            'path': f'/{attachments_directory}/{filename}'
                        })

                post_model['embed'] = json.dumps(post_model['embed'])
                post_model['file'] = json.dumps(post_model['file'])
                for i in range(len(post_model['attachments'])):
                    post_model['attachments'][i] = json.dumps(post_model['attachments'][i])

                columns = post_model.keys()
                data = ['%s'] * len(post_model.values())
                data[-1] = '%s::jsonb[]'  # attachments
                query = "INSERT INTO booru_posts ({fields}) VALUES ({values})".format(
                    fields=','.join(columns),
                    values=','.join(data))
                cursor3 = conn.cursor()
                cursor3.execute(query, list(post_model.values()))
                conn.commit()
            except DownloaderException:
                continue

    conn.close()

    if scraper_data['body'].get('nextUrl'):
        import_posts(key, scraper_data['body']['nextUrl'])
class leaguesSpider(scrapy.Spider):
    logger = logging.getLogger(__name__)
    logger.info('start generate proxy ip')
    proxy.get_proxy()

    name = 'leagues'
    allowed_domains = ['sodasoccer.com']
    start_urls = [
        'http://www.sodasoccer.com/dasai/index.html',
    ]

    # Fetch the pages of the five major leagues
    def parse(self, response):
        # Only take the list elements for the five major European leagues
        leagues = response.xpath('//div[@class="league_box1"][2]/ul/li')[0:5]
        # Relative URLs of each league's detail page
        leagues_urls = leagues.xpath('div[@class="l_box"]/a/@href').extract()
        # Join into absolute detail-page URLs
        for league_url in leagues_urls:
            url = 'http://www.sodasoccer.com' + league_url
            self.logger.info(url)
            # Crawl the league details
            yield scrapy.Request(url, callback=self.parse_league)

    # Crawl league details
    def parse_league(self, response):
        clubs = []
        league = LeagueItem()
        # League logo
        league['img_urls'] = [
            response.xpath('//div[@class="limg"]/img/@src').extract()[0].split('?')[0]
        ]
        # League name (Chinese)
        league['name'] = response.xpath('//h1[@class="lh1"]/text()').extract()[0]
        # League name (English)
        league['league_uname'] = response.xpath('//h2[@class="lh2"]/text()').extract()[0]
        # Clubs in this league
        league_clubs = response.xpath('//div[@class="l_zwq"]/ul/li/p/a/text()').extract()
        # Convert list items to plain strings so Chinese is not stored as unicode escapes in MySQL
        for club in league_clubs:
            tmp = club.strip('\r\n\t\t\t').strip()
            clubs.append(tmp)
        league['league_clubs'] = clubs
        # Detail pages of each club in the current league
        clubs_details = response.xpath(
            '//div[@class="l_zwq"]/ul/li/div[@class="qiuduitu_wb"]/a/@href'
        ).extract()
        # Recursively crawl the club details
        for club_details in clubs_details:
            yield scrapy.Request('http://www.sodasoccer.com' + club_details,
                                 callback=self.parse_club)
        self.logger.info(league)
        yield league

    # Crawl club details
    def parse_club(self, response):
        if response.status == 200:
            club = ClubItem()
            # Chinese name of the club's league
            club['club_league'] = response.xpath(
                '//div[@class="leida"]/ul/li[@class="world_fu1_li world_fu1_li_frist"]/span/a/text()'
            ).extract()[0]
            # Club logo
            club['img_urls'] = [
                response.xpath('//div[@class="photo"]/img/@src').extract()[0].split('?')[0]
            ]
            # Parent element of the club details
            club_info = response.xpath('//div[@class="jiben"]/ul[@class="xin"]')
            # Club name (Chinese)
            club['name'] = club_info.xpath('li/text()').extract()[0]
            # Club name (English)
            club['club_uname'] = club_info.xpath('li/text()').extract()[1]
            # Head coach
            club_manager = club_info.xpath('li/a/text()').extract()[0]
            if club_manager:
                club['club_manager'] = club_manager
            else:
                club['club_manager'] = '-'
            # Home stadium
            soccerfield = club_info.xpath('li/text()').extract()[2]
            if soccerfield:
                club['club_soccerfield'] = soccerfield
            else:
                club['club_soccerfield'] = '-'
            try:
                new_info = response.xpath('//div[@id="lineup_0"]/table')
                old_info = response.xpath('//div[@id="lineup_1"]/table')
                # Players signed for the new season
                new_players = new_info.xpath('tr/td/a/text()').extract()[::2]
                # Last season's squad
                old_players = old_info.xpath('tr/td/a/text()').extract()[::2]
                # All players
                club['club_players'] = new_players + old_players
                new_players_details = new_info.xpath('tr/td/a/@href').extract()[::2]
                old_players_details = old_info.xpath('tr/td/@href').extract()[::2]
                players_details = new_players_details + old_players_details
                self.logger.info('All players of club %s is %s',
                                 club['club_uname'], players_details)
            except:
                club['club_players'] = '-'
            if players_details:
                # Recursively crawl the player details
                for player_details in players_details:
                    yield scrapy.Request('http://www.sodasoccer.com' + player_details,
                                         callback=self.parse_player)
            self.logger.info(club)
            yield club
        else:
            self.logger.info('get league failed, Try again')
            yield scrapy.Request(response.url, callback=self.parse_club)

    # Crawl player details
    def parse_player(self, response):
        player = PlayerItem()
        # Player name (Chinese)
        player['name'] = response.xpath('//div[@class="detailhead"]/h1/text()').extract()[0]
        info = response.xpath('//div[@class="jiben"]/ul[@class="xin"]')
        # Player name (English)
        player['player_uname'] = info.xpath('li/text()').extract()[0].strip(':').strip()
        try:
            birth_tmp = info.xpath('li/text()').extract()[1].strip().split('-')[0]
            # Birth year
            birth = int(birth_tmp.strip())
            this_year = int(datetime.datetime.now().year)
            # Age
            player['player_age'] = this_year - birth
        except Exception as error:
            self.logger.info(error)
            player['player_age'] = 'unknow'
        # Position on the pitch
        player['player_position'] = info.xpath('li/span/strong/text()').extract()[0].strip()
        # Nationality
        player['player_nationality'] = info.xpath('li/span/strong/text()').extract()[3].strip()
        # Height
        player['player_high'] = info.xpath('li/text()').extract()[3].strip()
        # Weight
        player['player_weight'] = info.xpath('li/span/strong/text()').extract()[2].strip()
        # Market value
        player['player_networth'] = info.xpath('li/text()').extract()[2].strip()
        # Current club
        player['player_club'] = response.xpath(
            '//div[@class="leida"]/ul/li[@class="world_fu1_li world_fu1_li_frist"]/span/a/text()'
        ).extract()[0].strip()
        # Shirt number
        player_number = response.xpath(
            '//div[@class="leida"]/ul/li[@class="world_fu1_li world_fu1_li_sec"]/span[@class="world_hao_con world_hao_con1"]/text()'
        ).extract()
        if not player_number:
            player['player_number'] = 'unknow'
        else:
            player['player_number'] = player_number[0].strip()
        # Photo
        player['img_urls'] = [
            response.xpath('//div[@class="photo"]/img/@src').extract()[0].strip().split('?')[0]
        ]
        # League
        player['player_league'] = response.xpath(
            '//div[@id="career_stat_0"]/table/tr/td/text()').extract()[0].strip()
        self.logger.info(player)
        yield player
freqs=", ".join(str(x) for x in valid_frequencies)) print "So we will use the closest acceptable number of {best_freq}".format( best_freq=best_frequency) return best_frequency if __name__ == '__main__': #fix_monitor_frequency(frequency_threshold=15, new_frequency=60) #fix_monitor_type("SIMPLE", "BROWSER") # Get all the synthetics monitors monitors = {x['uri']: x for x in newrelic.get_synthetics_monitors()} # Pull the proxy databag and parse out the staging and production sites proxy_layer = proxy.get_proxy() monitor_default_frequency = calculate_synthetics_timing(site_entries) fix_monitor_frequency(frequency_threshold=monitor_default_frequency - 1, new_frequency=monitor_default_frequency) for site_url, entry in proxy['production'].items(): # Determine protocol from proxy databag if "ssl" in entry or "ssl_force" in entry: protocol = "https" else: protocol = "http" full_url = "{protocol}://{site_url}".format(protocol=protocol, site_url=site_url) print "Working on {url}...".format(url=full_url) if full_url in monitors.keys():
    if request.method == 'GET':
        logging.debug("GET request")
        header, body = get_proxy()
        if 'filename' in header:
            logging.debug("Send certificate file")
            return send_file(BytesIO(body),
                             attachment_filename=header.get('filename'),
                             mimetype=header.get('Content-Type'))
        else:
            logging.debug("Send response")
            response = make_response(body)
            response.headers['Content-Type'] = header.get('Content-Type')
            return response


@APP.route('/health', methods=['GET'])
def health():
    """Check app health."""
    return "OK", 200


if __name__ == '__main__':
    logging.basicConfig(
        filename='/var/log/ttscache/app.log',
        format='[%(asctime)s][%(levelname)s][%(filename)s@%(lineno)d]->[%(message)s]',
        level=logging.DEBUG)
    APP.logger.setLevel(logging.DEBUG)
    get_proxy()
    APP.run(host="0.0.0.0", port=80)
logger.addHandler(handler)  # attach the handler to the logger
logger.setLevel(logging.DEBUG)  # set the level for persisted logs

# Console output settings
console = logging.StreamHandler()
console.setLevel(logging.INFO)
console.setFormatter(formatter)
logger.addHandler(console)

logging.debug("Program started, initialization finished")
logging.info("Starting Baidu SEO visits")

p_dic = {'keyword': 'IT运维经验', 'url': 'www.qnjslm.com'}  # keyword: "IT operations experience"
baidu_spider = Baidu_Spider.GetKeyWordUrl(p_dic)
ua = Get_UA.get_user_agent()
proxy = proxy.GetProxy()

while 1:
    (proxy_code, proxy_ip) = proxy.get_proxy()
    if proxy_code:
        if "no proxy" in proxy_ip:
            logging.warning("No usable proxy IP in the pool, pausing")
            time.sleep(120)
        (baidu_code, baidu_message) = baidu_spider.man(proxy_ip, ua)
        if baidu_code == 10:
            sys.exit(1)
        else:
            proxy.delete_proxy(proxy_ip)
    else:
        proxy.delete_proxy(proxy_ip)