def start_requests(self):
    for url in self.start_urls:
        if url in ['http://data.trust.hexun.com/list1.shtml',
                   'http://data.trust.hexun.com/list2.shtml',
                   'http://data.trust.hexun.com/list3.shtml',
                   'http://data.trust.hexun.com/list4.shtml']:
            yield scrapy.Request(url,
                                 meta={'page': 1, 'baseUrl': url},
                                 headers={'User-Agent': ua(os=('linux', 'win', 'mac'))},
                                 callback=self.trustListparse)
        if url == 'http://jingzhi.funds.hexun.com/newpj/allpj.aspx':
            yield scrapy.Request(url,
                                 headers={'User-Agent': ua(os=('linux', 'win', 'mac'))},
                                 callback=self.gradeListparse)
        if url == 'http://data.trust.hexun.com/companylist.shtml':
            yield scrapy.Request(url,
                                 headers={'User-Agent': ua(os=('linux', 'win', 'mac'))},
                                 callback=self.trustConpanyListparse)
        if url == 'http://stockdata.stock.hexun.com/rzrq/broker.shtml':
            yield scrapy.Request(url,
                                 headers={'User-Agent': ua(os=('linux', 'win', 'mac'))},
                                 callback=self.rzrqConpanyListparse)
        if url == 'http://data.bank.hexun.com/lccp/AllLccp.aspx?col=fld_issenddate&tag=desc&orderMarks=&page={page}':
            # the start URL is a template with a literal {page}; fill in page 1 for the first request
            yield scrapy.Request(url.format(page=1),
                                 headers={'User-Agent': ua(os=('linux', 'win', 'mac'))},
                                 callback=self.BankProListparse)
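# The ua() helper used throughout these snippets is not shown; here it is assumed
# to return a random User-Agent string, optionally restricted to the given OS
# families. A minimal sketch under that assumption (the UA pool is illustrative,
# and later snippets also call ua() in richer ways, e.g. ua(n).thread_uas):
import random

_UA_POOL = {
    'win': ['Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'],
    'mac': ['Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1 Safari/605.1.15'],
    'linux': ['Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'],
}

def ua(os=('win', 'mac', 'linux')):
    # pick a random OS family first, then a random UA string from that family's pool
    return random.choice(_UA_POOL[random.choice(os)])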
def start_requests(self):
    page = 1
    for url in self.start_urls:
        # pages fetched with a plain GET
        if url in ['https://www.howbuy.com/fund/fundranking/',
                   'https://www.howbuy.com/fund/company/',
                   'https://www.howbuy.com/fund/manager/']:
            yield scrapy.Request(
                url,
                method='GET',
                headers={'User-Agent': ua(os=('win', 'mac', 'linux'))},
                cookies=self.cookies,
                priority=1,
                meta={'page': page},
            )
        # pages fetched with a POST form
        if url in ['https://simu.howbuy.com/mlboard.htm',
                   'https://simu.howbuy.com/manager/',
                   'https://simu.howbuy.com/company/']:
            page = 1
            data = Con.changeData(url, page)
            yield scrapy.FormRequest(
                url,
                method='POST',
                headers={'User-Agent': ua(os=('win', 'mac', 'linux'))},
                cookies=self.cookies,
                formdata=data,
                priority=1,
                meta={'page': page},
            )
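# Con.changeData builds the POST form body for the simu.howbuy.com boards; the
# real field names never appear in these snippets. A hypothetical sketch,
# assuming the endpoints paginate on a simple page-number field (the 'page',
# 'perPage', and 'allPage' keys are placeholders, not confirmed parameters):
class Con:
    @staticmethod
    def changeData(url, page, per_page=20, all_page=None):
        # FormRequest expects every formdata value to be a string
        data = {'page': str(page), 'perPage': str(per_page)}
        if all_page is not None:
            data['allPage'] = str(all_page)
        return data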
def parse(self, response):
    page = response.meta['page']
    configs = Con.parseChioce(response.url)
    if page == 1:
        # derive the total page count from the "共N页" (N pages in total) marker
        allPage = re.search(r'共(\d+)页', response.text)
        if allPage:
            allPage = int(allPage.group(1)) if allPage.group(1).isdigit() else 1
        else:
            allPage = 1
    else:
        allPage = response.meta['allPage']
    if configs['htmlreplace']:
        # rewrite the raw HTML before re-parsing it
        strs = Con.replaceHtml(response.text, configs['htmlreplace'])
        response_ = scrapy.Selector(text=strs)
    else:
        response_ = response
    kw = S.select_content(response_, configs['geturl'], response)
    kw = set(kw)
    if kw:
        if configs['method'] == 'get':
            for url in kw:
                url = configs['format'].format(url)
                yield scrapy.Request(
                    url,
                    headers={'User-Agent': ua(os=('win', 'mac', 'linux'))},
                    cookies=self.cookies,
                    priority=1,
                    callback=eval(configs['callback']))  # resolve the callback expression named in the config
        # next page
        if page < allPage:
            page += 1
            data = Con.changeData(response.url, page, 20, allPage)
            yield scrapy.FormRequest(
                response.url,
                formdata=data,
                headers={'User-Agent': ua(os=('win', 'mac', 'linux'))},
                cookies=self.cookies,
                priority=1,
                meta={'page': page, 'allPage': allPage},
            )
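# parse() is driven by the per-site config dict returned by Con.parseChioce.
# The actual values are site-specific, but the keys it reads are fixed by the
# code above. An illustrative (hypothetical) entry:
example_config = {
    'htmlreplace': None,              # optional rewrite rules handed to Con.replaceHtml, or falsy to skip
    'geturl': '//a/@href',            # selector passed to S.select_content to collect detail links
    'method': 'get',                  # how the extracted detail pages are fetched
    'format': 'https://simu.howbuy.com{}',   # template the extracted fragment is substituted into
    'callback': 'self.parse_detail',  # expression eval()'d above to resolve the parse callback
}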
def trustConpanyListparse(self, response):
    urls = response.xpath("//ul[@class='clearfix xtList']/li/a/@href").extract()
    for url in urls:
        url = response.urljoin(url)
        yield scrapy.Request(url,
                             callback=self.trustConpanyInfoParse,
                             headers={'User-Agent': ua(os=('linux', 'win', 'mac'))})
def __init__(self):
    self.session = requests.Session()
    self.session.headers.update({
        "User-Agent": ua(),
        "Content-Type": "application/json",
        "Accept-Encoding": "gzip, deflate, br",
    })
    self.apikey = apikey
    self.base_params = dict(key=self.apikey, format='json')
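# A sketch of how base_params would typically be merged into each call made on
# this session; the fetch() name and the per-call extras are placeholders, not
# part of the original class:
def fetch(self, endpoint, **extra):
    # per-call parameters extend (and may override) the shared defaults
    params = dict(self.base_params, **extra)
    resp = self.session.get(endpoint, params=params)
    resp.raise_for_status()
    return resp.json()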
def trustListparse(self, response):
    baseUrl = response.meta['baseUrl']
    page = response.meta['page']
    if not response.meta.get('TotalPage'):
        # the "末页" (last page) link carries the total page count in its query string
        TotalPage = int(response.xpath('//a[text()="末页"]/@href').extract_first().split('=')[-1])
    else:
        TotalPage = response.meta['TotalPage']
    urls = response.xpath(r'//a[re:test(@href, "/\d+\.shtml")]/@href').extract()
    for url in urls:
        url = response.urljoin(url)
        yield scrapy.Request(url,
                             headers={'User-Agent': ua(os=('linux', 'win', 'mac'))},
                             callback=self.trustParse)
    if page < TotalPage:
        page += 1
        NextUrl = baseUrl + '?&index=0&order=1&page={page}'.format(page=page)
        yield scrapy.Request(NextUrl,
                             meta={'page': page, 'TotalPage': TotalPage, 'baseUrl': baseUrl},
                             headers={'User-Agent': ua(os=('linux', 'win', 'mac'))},
                             callback=self.trustListparse)
def bash_headers(self):
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        'Host': 'qyxy.baic.gov.cn',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': ua(os=('win',))
    }
    return headers
def __init__(self):
    self.ip = pr()
    self.ua = ua()
def hdr():
    return {
        'User-Agent': ua(os=('win', 'linux', 'mac')),
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    }
def _headers():
    return {'User-Agent': ua(os=('win', 'linux', 'mac')), 'Host': 'xueqiu.com'}
def getcookie():
    # visit the homepage with an empty jar and return the session cookies it sets
    res = requests.get('https://xueqiu.com/', headers={'User-Agent': ua()}, cookies={})
    return res.cookies.get_dict()
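# Typical use of the two helpers above: seed a cookie jar from the homepage,
# then reuse it on subsequent requests. The endpoint path below is a
# placeholder, not an actual xueqiu API route:
import requests

cookies = getcookie()
res = requests.get('https://xueqiu.com/some/endpoint',
                   headers=_headers(), cookies=cookies)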
def get_ua():
    return {'User-Agent': ua(os=('win', 'mac', 'linux'))}
def __init__(self, thread_count=1):
    self.ips = pr(thread_count).thread_ips
    self.uas = ua(thread_count).thread_uas
    self.masks = []
    # pair each proxy address with a User-Agent to form one "mask" per thread
    for i, u in zip(self.ips, self.uas):
        self.masks.append({"address": i, "user-agent": u})
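# A sketch of how one of these masks would be applied to an outgoing request.
# The proxy URL scheme assumes pr() yields plain "host:port" strings, which is
# an assumption, not something the snippet above guarantees:
import requests

def masked_get(url, mask):
    # route the request through the mask's proxy and spoof its User-Agent
    return requests.get(
        url,
        headers={'User-Agent': mask['user-agent']},
        proxies={'http': 'http://' + mask['address'],
                 'https': 'http://' + mask['address']},
        timeout=10,
    )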