def main():
    minnap = 200
    maxnap = 800
    scale = 1000.0
    prefix = ''
    if len(sys.argv) < 2:
        with open(path.join(thisdir, 'args.txt'), 'rU') as argslist:
            lines = argslist.readlines()
        lines[:] = [line.replace('\r', ' ').replace('\n', ' ') for line in lines]
        argline = ''.join(lines)
        sys.argv = [sys.argv[0]] + argline.split()
    optlist, args = getopt.getopt(sys.argv[1:], 'u:d:m:M:s:i')
    options = dict(optlist)
    if '-u' not in options:
        print_usage()
        return
    nap = False
    if '-m' in options or '-M' in options or '-s' in options:
        nap = True
    UserAgent.retrieve(options['-u'], nap=nap, minnap=minnap, maxnap=maxnap,
                       napscale=scale, interactive='-i' in options)
    dbutils.export_csv(dbutils.dump(), description=options['-d'])
def SendReq(start_url):
    request = urllib2.Request(
        url=start_url,
        headers={'User-Agent': UserAgent.getAgent()}
    )
    response = urllib2.urlopen(request)
    return response.read()
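# Every snippet in this file leans on a local UserAgent helper module whose
# source is not shown. A minimal stand-in, assuming it just draws from a fixed
# list of UA strings (the function names are taken from the call sites below;
# the strings themselves are only illustrative), might look like:
import random

_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 '
    '(KHTML, like Gecko) Version/15.1 Safari/605.1.15',
]

def getAgent():
    return random.choice(_AGENTS)

# Several snippets use differently-named wrappers for the same thing.
randomUserAgent = get_user_agent = getPCUserAgent = getAgent

def get_headers():
    # Some call sites expect a ready-made headers dict rather than a bare string.
    return {'User-Agent': getAgent()}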
async def _action_combo_get_page_content(url, cookies_dir='data/cookies/'):
    try:
        # Work out which domain the url belongs to.
        parsed_uri = urlparse(url)
        cookies_file = "".join([cookies_dir, parsed_uri.netloc, "cookie"])
        my_cookie_file = DataFile.read_file_intostr(cookies_file)
        browser = await launch({
            "executablePath": "chromium-browser",
            "args": ["--no-sandbox"]
        })
        page = await browser.newPage()
        # Load saved cookies, if any.
        if len(my_cookie_file) > 0:
            my_cookie_object = json.loads(my_cookie_file)
            print("".join(["Load ", str(len(my_cookie_object)), " cookie item(s)."]))
            for row in my_cookie_object:
                await page.setCookie(row)
        # Set the user agent.
        ua_box = UserAgent.UserAgentBox()
        await page.setUserAgent(ua_box.wap_normal_user)
        await page.goto(url)
        new_cookie = await page.cookies()
        json_cookie = json.dumps(new_cookie)
        res = await action_get_page_content(page)
        DataFile.write_full_file(cookies_file, json_cookie)
        await browser.close()
        return res
    except Exception as e:
        traceback.print_exc()
        return ""
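# A coroutine like the one above has to be driven from an event loop; a minimal,
# hypothetical caller (the URL is a placeholder):
import asyncio

if __name__ == '__main__':
    content = asyncio.get_event_loop().run_until_complete(
        _action_combo_get_page_content('https://example.com/'))
    print(len(content))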
def Check():
    try:
        for i in proxy:
            i = i.strip()
            ip, port = i.split(":")
            addr = dict(http=f"http://{ip}:{port}", https=f"https://{ip}:{port}")
            head = {"User-Agent": users.UserAgent()}
            try:
                req = r.get(url, proxies=addr, headers=head, timeout=6)
            except Exception as e:
                # Skip proxies that fail to connect; bail out on anything else.
                if "ProxyError" in str(e):
                    continue
                else:
                    print(f"{c.R}Error{c.C}!!!{c.W} {e}\n")
                    exit()
            if req.status_code == 200:
                print(f"{c.C}Hit{c.R}_{c.C}Proxy {c.R}=> {c.C}{ip}:{port}{c.W}\n")
                with open("Hit.txt", "a+") as files:
                    files.write(f"{ip}:{port}\n")
            else:
                print(f"{c.C}Not{c.R}_{c.C}Found {c.R}=> {c.C}{ip}:{port}{c.W}\n")
    except Exception as e:
        print(f"{c.R}Error{c.C}!!!{c.W} {e}\n")
        exit()
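# Check() reads module-level names (`proxy`, `url`, `users`, `c`, `r`) that are
# defined elsewhere in the original file. A plausible setup, mirroring the
# Instagram snippet at the end of this section (file names are assumptions):
import requests as r
import Color as c
import users

url = 'https://httpbin.org/ip'           # any fast, reliable test endpoint
proxy = open('proxies.txt').readlines()  # one ip:port per line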
def screen(url, select):
    headers = UserAgent.get_headers()  # pick a random headers dict
    html = requests.get(url=url, headers=headers)
    html.encoding = 'gbk'  # the site's encoding
    html = html.text
    soup = BeautifulSoup(html, 'lxml')
    return soup.select(select)
def request(req_url):
    # time.sleep(2)
    useragent = {'user-agent': UserAgent.randomUserAgent()}
    req = requests.get(req_url, headers=useragent)
    req.raise_for_status()
    req.encoding = req.apparent_encoding
    return req
def post_pan(form_action, form_post):
    # Submit the form data and get back an html page.
    post_data = {'e_secret_key': form_post}
    ua = {'user-agent': UserAgent.randomUserAgent()}
    post_html = requests.post(form_action, headers=ua, data=post_data)
    post_html.raise_for_status()
    post_html.encoding = post_html.apparent_encoding
    return post_html
def get_page_detail(url):
    try:
        response = requests.get(url, headers=UserAgent.get_user_agent())
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None
def spider_home():
    """
    Fetch the item's url, title and price.
    :return: the item's product id
    """
    shop = input("Enter the product to search for: ")
    global headers
    headers = UserAgent.get_headers()  # pick a random headers dict
    url = 'https://search.jd.com/Search?keyword={shop}&enc=utf-8&wq=%E5%B0%8F&pvid=469d5d51a3184cc9a053124dc020b31f'.format(
        shop=shop)
    try:
        r = requests.get(url=url, headers=headers).content
        content = le.HTML(r)
        href = content.xpath(
            '//*[@id="J_goodsList"]/ul/li[1]/div/div[1]/a/@href')
        price = content.xpath(
            '//*[@id="J_goodsList"]/ul/li[1]/div/div/strong/i/text()')
        title_01 = content.xpath(
            '//*[@id="J_goodsList"]/ul/li[1]/div/div/a/em/text()')
        title = [x.strip() for x in title_01 if x.strip() != '']  # strip extra spaces and \n from the title
        re_01 = re.compile(r'\d+')
        number = re_01.findall(str(href))
        shop_price_01 = "".join(price)
        print("Item price: " + shop_price_01)
        global shop_title  # title is global so the output file can be renamed after it
        shop_title_01 = "".join(title)
        print("Item title: " + shop_title_01)
        for index in href:
            global href_shop
            href_shop = 'http:' + index
            print(href_shop)
        for num in number:
            return num
    except Exception:
        print('Scrape failed')
def get_one_page(url):
    try:
        response = requests.get(url, headers=UserAgent.get_user_agent())
        if response.status_code == 200:
            return response.text
        return None
    except RequestException as e:
        print(e.response)
        return None
def download_image(url):
    print("Downloading... " + url)
    try:
        response = requests.get(url, headers=UserAgent.get_user_agent())
        if response.status_code == 200:
            # Binary content.
            save_image(response.content)
        return None
    except RequestException:
        return None
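# save_image() is not defined in this excerpt. A plausible stand-in, following
# the common pattern of naming the file after an md5 of the bytes so repeated
# downloads are de-duplicated (the directory and extension are assumptions):
import os
from hashlib import md5

def save_image(content):
    file_path = '{0}/{1}.{2}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg')
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)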
# Imports inferred from usage: `ua.random` matches fake_useragent's API and
# Session()/post matches requests.
from fake_useragent import UserAgent
from requests import Session

def creatTopic(data):
    ua = UserAgent()
    user_agent = {
        'User-agent': ua.random,
        'Referer': 'https://dealbub.com/',
        'Content-type': 'content_type_value'
    }
    session = Session()
    session.head('https://dealbub.com/')  # warm up the session cookies first
    data = topicContect()  # note: this overwrites the `data` argument
    response = session.post(url='https://alisdeals.com/posts/',
                            headers=user_agent,
                            data=data)
    return response
def get_page(offset, keywords):
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keywords,
        'autoload': 'true',
        'count': '20',
        'cur_tab': '3',
        'from': 'gallery'
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
    try:
        response = requests.get(url, headers=UserAgent.get_user_agent())
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None
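# The query asks for 20 items per page ('count': '20'), so a caller would step
# `offset` in multiples of 20; a hypothetical driver (the keyword is a placeholder):
if __name__ == '__main__':
    for offset in range(0, 100, 20):
        page = get_page(offset, 'street style')
        if page:
            print(len(page))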
def screenPage(url, select):
    html = requests.get(url=url, headers=UserAgent.get_headers())
    html.encoding = 'gbk'
    html = html.text
    soup = BeautifulSoup(html, 'lxml')
    return soup.select(select)[0].next_sibling.text
        res = res + str(i) + "=" + str(cookie[i]) + ";"
    res = res[0:-1]
    return res


times = 0
success = 0
ipSets = []
dupCount = 0
while True:
    # print(getTrace10(125))
    # exit(0)
    try:
        session = requests.Session()
        cookie = {}
        userAgent = UserAgent.getPCUserAgent()
        p = getProxy()
        p = json.loads(p)
        print(p[0])
        if p[0] in ipSets:
            dupCount += 1
            print("duplicated IP " + str(p[0]))
            print("dupCount : " + str(dupCount))
        else:
            ipSets.append(p[0])
        proxy = {"http": str(p[0]), "https": str(p[0])}
        header = getHeader()
        header['User-Agent'] = userAgent
        header['Cookie'] = getCookie()
        timeStamp = int(round(time.time() * 1000))
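# getProxy() and getHeader() are not shown in this excerpt. From the call sites,
# getProxy() must return a JSON array whose first element is a proxy address and
# getHeader() a plain dict of headers; minimal hypothetical stand-ins:
def getProxy():
    # e.g. the body of a proxy-pool API response; address and format are assumptions
    return '["http://127.0.0.1:8888"]'

def getHeader():
    return {"Accept": "*/*", "Connection": "keep-alive"}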
def get_user_agent():
    """
    Get the request headers.
    """
    user_agent = {'User-Agent': userAgent.get_user_agent()}
    return user_agent
import scrapy
import json
import os
import time
import random
import jd_home1
import jd_main
import UserAgent

number = jd_home1.spider_home()  # get the product id
comment_file_path = '../../../id.txt'  # where the file is stored
headers = UserAgent.get_headers()  # pick a random headers dict


class JdHomeSpider(scrapy.Spider):
    name = 'jd_home'
    # allowed_domains = [' ']
    # start_urls = ['http:// /']

    def start_requests(self):
        number_page = 10
        try:
            for page in range(1, number_page):
                url = 'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&productId={number}' \
                      '&score=0&sortType=5&page={page}&pageSize=10&isShadowSku=0&fold=1'.format(
                          page=page, number=number)
                yield scrapy.Request(
                    url=url,
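# The URLs above return JSONP wrapped in fetchJSON_comment98(...); the parse
# callback (not included in this excerpt) would typically strip the wrapper
# before decoding, along these lines (a sketch, not the project's actual code):
def strip_jsonp(text):
    start = text.find('(') + 1
    end = text.rfind(')')
    return json.loads(text[start:end])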
def get_user_agent():
    """
    Get the request headers.
    """
    user_agent = {"User-Agent": userAgent.get_user_agent()}
    return user_agent
import telegram
import sys
import proxies
import UserAgent
from scraper_api import ScraperAPIClient
from proxies import random_proxy
from copy import copy
from lxml import html
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from datetime import date, datetime, timedelta

client = ScraperAPIClient('1911aafa4321c1afdbff5112a4d6685e')
# iplist = ['162.208.48.84:8118','165.138.4.41:8080']
# proxies = {'https':random.choice(iplist) }
# print(proxies)
ua = UserAgent.UserAgent()
# imported_proxy = random_proxy.random_proxies()
# print(imported_proxy)
intervalTimeBetweenCheck = 0
dateIndex = datetime.now()
emailinfo = {}
IFTTT_Key = ""
IFTTT_EventName = ""

# msg_content format
# msg_content['Subject'] = 'Subject'
# msg_content['Content'] = 'This is a content'


def isbotalive():
# Models
import requests as r, Color as c, UserAgent as usr, Logo

# Headers
headers = {
    'Host': 'www.instagram.com',
    'User-Agent': usr.UserAgent(),
    'Accept': '*/*',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate, br',
    'Referer': 'https://www.instagram.com/',
    'X-CSRFToken': '',
    'X-Instagram-AJAX': '1',
    'Content-Type': 'application/x-www-form-urlencoded',
    'X-Requested-With': 'XMLHttpRequest',
    'Content-Length': '',
    'Cookie': '',
    'Connection': 'keep-alive'
}
#####....

# Global values
cambo = open(input(f"{c.C}Cambo{c.R}_{c.C}List {c.R}=> {c.W}"))
########
proxy = open(input(f"{c.C}Proxy{c.R}_{c.C}List {c.R}=> {c.W}"), "r").readlines()


# Proxies
def Proxies(num):
    pr = proxy[num].strip("\n")
    return pr
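# Proxies(num) returns a bare ip:port string; to actually route a request
# through it, it would be plugged into a requests proxies dict the same way the
# proxy checker above does (a hypothetical call):
pr = Proxies(0)
addr = {"http": f"http://{pr}", "https": f"https://{pr}"}
resp = r.get('https://www.instagram.com/', headers=headers, proxies=addr, timeout=6)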