Example No. 1
 def start_requests(self):
     for url in self.start_urls:
         # paginated trust-product list pages
         if url in ['http://data.trust.hexun.com/list1.shtml',
                    'http://data.trust.hexun.com/list2.shtml',
                    'http://data.trust.hexun.com/list3.shtml',
                    'http://data.trust.hexun.com/list4.shtml']:
             yield scrapy.Request(url,
                                  meta={'page': 1, 'baseUrl': url},
                                  headers={'User-Agent': ua(os=('linux', 'win', 'mac'))},
                                  callback=self.trustListparse)
         # fund-rating list page
         elif url == 'http://jingzhi.funds.hexun.com/newpj/allpj.aspx':
             yield scrapy.Request(url,
                                  headers={'User-Agent': ua(os=('linux', 'win', 'mac'))},
                                  callback=self.gradeListparse)
         # trust-company list page
         elif url == 'http://data.trust.hexun.com/companylist.shtml':
             yield scrapy.Request(url,
                                  headers={'User-Agent': ua(os=('linux', 'win', 'mac'))},
                                  callback=self.trustConpanyListparse)
         # margin-trading broker list page
         elif url == 'http://stockdata.stock.hexun.com/rzrq/broker.shtml':
             yield scrapy.Request(url,
                                  headers={'User-Agent': ua(os=('linux', 'win', 'mac'))},
                                  callback=self.rzrqConpanyListparse)
         # bank wealth-management product list page
         elif url == 'http://data.bank.hexun.com/lccp/AllLccp.aspx?col=fld_issenddate&tag=desc&orderMarks=&page={page}':
             yield scrapy.Request(url,
                                  headers={'User-Agent': ua(os=('linux', 'win', 'mac'))},
                                  callback=self.BankProListparse)
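Every example on this page relies on a shared `ua()` helper that returns a random User-Agent string, optionally restricted by operating system through the `os` keyword (Example No. 13 below uses a different, object-returning call form). The helper itself is not shown on this page; a minimal sketch of the string-returning variant, with a purely illustrative UA pool, might look like this:

    import random

    # Hypothetical stand-in for the ua() helper used throughout these examples.
    _UA_POOL = {
        'win': ['Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'],
        'mac': ['Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'],
        'linux': ['Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'],
    }

    def ua(os=('win', 'mac', 'linux')):
        # flatten the pools for the requested OS families and pick one at random
        pool = [s for key in os for s in _UA_POOL[key]]
        return random.choice(pool)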
Example No. 2
 def start_requests(self):
     page = 1
     for url in self.start_urls:
         # GET pages: fetched with a plain Request
         if url in [
                 'https://www.howbuy.com/fund/fundranking/',
                 'https://www.howbuy.com/fund/company/',
                 'https://www.howbuy.com/fund/manager/'
         ]:
             yield scrapy.Request(
                 url,
                 method='GET',
                 headers={'User-Agent': ua(os=('win', 'mac', 'linux'))},
                 cookies=self.cookies,
                 priority=1,
                 meta={'page': page},
             )
         # POST pages: fetched with a FormRequest carrying form data
         if url in [
                 'https://simu.howbuy.com/mlboard.htm',
                 'https://simu.howbuy.com/manager/',
                 'https://simu.howbuy.com/company/'
         ]:
             page = 1
             data = Con.changeData(url, page)
             yield scrapy.FormRequest(
                 url,
                 method='POST',
                 headers={'User-Agent': ua(os=('win', 'mac', 'linux'))},
                 cookies=self.cookies,
                 formdata=data,
                 priority=1,
                 meta={'page': page},
             )
Example No. 3
    def parse(self, response):
        page = response.meta['page']
        configs = Con.parseChioce(response.url)
        if page == 1:
            # read the total page count from pagination text like '共25页' ("25 pages in total")
            allPage = re.search(r'共(\d+)页', response.text)
            allPage = int(allPage.group(1)) if allPage else 1
        else:
            allPage = response.meta['allPage']
        if configs['htmlreplace']:
            # rewrite the raw HTML before building a Selector
            strs = Con.replaceHtml(response.text, configs['htmlreplace'])
            response_ = scrapy.Selector(text=strs)
        else:
            response_ = response
        kw = S.select_content(response_, configs['geturl'], response)
        kw = set(kw)
        if kw:
            if configs['method'] == 'get':
                for url in kw:
                    url = configs['format'].format(url)
                    yield scrapy.Request(
                        url,
                        headers={'User-Agent': ua(os=('win', 'mac', 'linux'))},
                        cookies=self.cookies,
                        priority=1,
                        callback=eval(configs['callback']))  # callback is configured as a string expression
        # request the next page
        if page < allPage:
            page += 1

            data = Con.changeData(response.url, page, 20, allPage)
            yield scrapy.FormRequest(
                response.url,
                formdata=data,
                headers={'User-Agent': ua(os=('win', 'mac', 'linux'))},
                cookies=self.cookies,
                priority=1,
                meta={
                    'page': page,
                    'allPage': allPage
                },
            )
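The pagination in Example No. 3 reads the total page count from Chinese pagination text of the form '共N页' ("N pages in total"). A standalone check of that regex (the sample string here is made up):

    import re

    m = re.search(r'共(\d+)页', '上一页 1 2 3 下一页 共25页')
    print(m.group(1) if m else None)  # prints: 25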
Example No. 4
 def trustConpanyListparse(self, response):
     urls = response.xpath("//ul[@class='clearfix xtList']/li/a/@href").extract()
     for url in urls:
         url = response.urljoin(url)
         yield scrapy.Request(url,
                              callback=self.trustConpanyInfoParse,
                              headers={'User-Agent': ua(os=('linux', 'win', 'mac'))})
Example No. 5
 def __init__(self):
     self.session = requests.Session()
     self.session.headers.update({
         "User-Agent": ua(),
         "Content-Type": "application/json",
         "Accept-Encoding": "gzip, deflate, br",
     })
     self.apikey = apikey
     self.base_params = dict(key=self.apikey, format='json')
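Example No. 5 keeps the API key in `self.base_params` instead of baking it into the session, so each call can merge it with request-specific parameters. A hedged sketch of how a method on this class might use that split (the endpoint URL and the `q` parameter are invented for illustration):

    def query(self, q):
        # merge the shared key/format params with per-call params; the endpoint is hypothetical
        params = dict(self.base_params, q=q)
        resp = self.session.get('https://api.example.com/search', params=params)
        resp.raise_for_status()
        return resp.json()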
Example No. 6
 def trustListparse(self, response):
     baseUrl = response.meta['baseUrl']
     page = response.meta['page']
     if not response.meta.get('TotalPage'):
         # the '末页' ("last page") link encodes the total page count at the end of its href
         TotalPage = int(response.xpath('//a[text()="末页"]/@href').extract_first().split('=')[-1])
     else:
         TotalPage = response.meta['TotalPage']
     urls = response.xpath(r'//a[re:test(@href, "/\d+\.shtml")]/@href').extract()
     for url in urls:
         url = response.urljoin(url)
         yield scrapy.Request(url,
                              headers={'User-Agent': ua(os=('linux', 'win', 'mac'))},
                              callback=self.trustParse)
     if page < TotalPage:
         page += 1
         NextUrl = baseUrl + '?&index=0&order=1&page={page}'.format(page=page)
         yield scrapy.Request(NextUrl,
                              meta={'page': page, 'TotalPage': TotalPage, 'baseUrl': baseUrl},
                              headers={'User-Agent': ua(os=('linux', 'win', 'mac'))},
                              callback=self.trustListparse)
Example No. 7
    def bash_headers(self):
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Connection': 'keep-alive',
            'Host': 'qyxy.baic.gov.cn',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': ua(os=('win',))
        }
        return headers


#if __name__ == '__main__':
#    a = GsxtcxSpider()
#    print(a.bash_headers())
Example No. 8
 def __init__(self):
     self.ip = pr()
     self.ua = ua()
Example No. 9
def hdr():
    return {
        'User-Agent': ua(os=('win', 'linux', 'mac')),
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    }
Example No. 10
def _headers():
    return {'User-Agent': ua(os=('win', 'linux', 'mac')), 'Host': 'xueqiu.com'}
Example No. 11
def getcookie():
    res = requests.get('https://xueqiu.com/',
                       headers={'User-Agent': ua()},
                       cookies={})
    return res.cookies.get_dict()
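Examples No. 10 and No. 11 are two halves of one pattern for xueqiu.com, which expects both a pinned Host header and session cookies on subsequent requests: fetch the anonymous cookies once with `getcookie()`, then reuse them alongside `_headers()`. A sketch of how they combine (the target path is left as the site root; a real scraper would request a specific page or API endpoint):

    cookies = getcookie()
    res = requests.get('https://xueqiu.com/',
                       headers=_headers(),
                       cookies=cookies)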
Example No. 12
def get_ua():
    return {'User-Agent': ua(os=('win', 'mac', 'linux'))}
Example No. 13
 def __init__(self, thread_count=1):
     self.ips = pr(thread_count).thread_ips
     self.uas = ua(thread_count).thread_uas
     # pair each proxy address with a user-agent string, one mask per thread
     self.masks = []
     for i, u in zip(self.ips, self.uas):
         self.masks.append({"address": i, "user-agent": u})
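Example No. 13 pre-builds one {proxy address, User-Agent} mask per worker thread. One plausible way to consume a mask, assuming `address` holds a full proxy URL such as `http://host:port` (the helper below is hypothetical):

    import requests

    def fetch_with_mask(url, mask):
        # route the request through the mask's proxy and send its user-agent
        return requests.get(url,
                            headers={'User-Agent': mask['user-agent']},
                            proxies={'http': mask['address'], 'https': mask['address']},
                            timeout=10)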