def start_requests(self):
    if self.cache:
        self.logger.info('Loading Exposure URLs From File %s.' % self.cache)
        self.start_urls = read_cache('cache', self.cache)
    for url in self.start_urls:
        yield self.make_requests_from_url(url)
def start_requests(self):
    if self.cache:
        self.logger.info('Loading New URLs From File %s.' % self.cache)
        self.start_urls = read_cache('cache', self.cache)
        # super(XinwenSpider, self).start_requests()
    for url in self.start_urls:
        yield self.make_requests_from_url(url)
def __init__(self, cache='news_cache', from_date=news.settings.START_DATE, to_date="2099-12-31", **kwargs):
    self.logger.debug(locals())
    self.cache = cache + ".ch"
    self.from_date = from_date
    self.to_date = to_date
    self.__dict__.update(kwargs)  # important
    self.start_urls = read_cache('news_cache', self.cache)
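read_cache is used throughout these snippets but its implementation is not shown here; the sketch below is only an assumption about its behaviour (read the lines of a cache file under a directory, returning an empty list when the file is missing).

import os

def read_cache(dir_name, file_name):
    # Hypothetical helper: return the stripped, non-empty lines of dir_name/file_name,
    # or an empty list if the cache file does not exist.
    path = os.path.join(dir_name, file_name)
    if not os.path.exists(path):
        return []
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]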
def start_requests(self):
    token = ''
    lines = read_cache('tokens', (self.plat_id or 'test') + '.tk')
    if self.need_token and lines:
        token = lines[0]
    url = self.start_formated_url.format(token=token)
    from scrapy.http import Request
    cookies = parse_cookies(lines[1])
    yield Request(url, cookies=cookies)
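parse_cookies is likewise not defined in this section. A minimal sketch, assuming the second cached line holds a raw 'name=value; name2=value2' cookie-header string and that a plain dict is what the Request cookies argument receives:

def parse_cookies(raw):
    # Hypothetical helper: split a raw cookie-header string into a dict.
    cookies = {}
    for pair in raw.split(';'):
        if '=' in pair:
            name, _, value = pair.strip().partition('=')
            cookies[name] = value
    return cookies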
def start_requests(self):
    for i in self.shortlist:
        token = ''
        lines = read_cache('tokens', (self.plat_id or 'test') + '.tk')
        if self.need_token and lines:
            token = lines[0]
        url = self.start_formated_url.format(token=token, page_index=i)
        from scrapy.http import Request
        cookies = parse_cookies(lines[1])
        yield Request(url, cookies=cookies, dont_filter=True)
def __init__(self, plat_id=None, plat_name=None, need_token='0', formated_url=None, cache='cache',
             *args, **kwargs):
    self.plat_id = plat_id
    self.plat_name = plat_name
    self.need_token = bool(int(need_token))
    self.start_formated_url = formated_url
    lines, total_page = read_cache('cache', cache + '.ch'), 0
    if lines:
        total_page = int(lines[0])
    self.shortlist = xrange(1, total_page + 1)
    super(YuqiSpider, self).__init__(*args, **kwargs)
def start_requests(self):
    if self.need_token:
        token = ''
        lines = read_cache('tokens', (self.plat_id or 'token') + '.tk')
        if self.need_token and lines:
            token = lines[0]
        timestamp = get_unix_time()
        signature = get_access_signature(token, timestamp, self.password, self.is_upper)
        body = {
            'token': token,
            'timestamp': timestamp,
            'signature': signature,
            'from_date': self.from_date,
            'to_date': self.to_date,
            'page_size': self.page_size,
            'page_index': self.page_index
        }
        if self.is_json:
            yield Request(self.start_formated_url, body=json.dumps(body), method='POST')
        else:
            yield scrapy.FormRequest(self.start_formated_url, formdata=body, dont_filter=True)
    else:
        if self.method:
            yield scrapy.FormRequest(
                self.start_formated_url + self.formated_parameters.format(
                    page_size=self.page_size, page_index=self.page_index,
                    from_date=self.from_date, to_date=self.to_date),
                method='GET', dont_filter=True)
        else:
            body = {
                'from_date': self.from_date,
                'to_date': self.to_date,
                'page_size': self.page_size,
                'page_index': self.page_index
            }
            if self.is_json:
                yield Request(self.start_formated_url, body=json.dumps(body), method='POST')
            else:
                yield scrapy.FormRequest(self.start_formated_url, formdata=body, dont_filter=True)
def start_requests(self):
    if self.need_token:
        token = ''
        lines = read_cache('tokens', (self.plat_id or 'token') + '.tk')
        if self.need_token and lines:
            token = lines[0]
        timestamp = get_unix_time()
        signature = get_access_signature(token, timestamp, self.password, self.is_upper)
        for date in get_date_list(from_date=self.from_date, to_date=self.to_date, delimiter='-'):
            body = {
                'token': token,
                'timestamp': timestamp,
                'signature': signature,
                'date': date
            }
            if self.is_json:
                yield Request(self.start_formated_url, body=json.dumps(body), method='POST')
            else:
                yield scrapy.FormRequest(self.start_formated_url, formdata=body, dont_filter=True)
    else:
        if self.method:
            for date in get_date_list(from_date=self.from_date, to_date=self.to_date, delimiter='-'):
                yield scrapy.FormRequest(
                    self.start_formated_url.format(date=date),
                    method='GET', dont_filter=True)
        else:
            for date in get_date_list(from_date=self.from_date, to_date=self.to_date, delimiter='-'):
                body = {'date': date}
                if self.is_json:
                    yield Request(self.start_formated_url, body=json.dumps(body), method='POST')
                else:
                    yield scrapy.FormRequest(self.start_formated_url, formdata=body, dont_filter=True)
def start_requests(self):
    if self.need_token:
        token = ''
        lines = read_cache('tokens', (self.plat_id or 'token') + '.tk')
        if self.need_token and lines:
            token = lines[0]
        timestamp = get_unix_time()
        signature = get_access_signature(token, timestamp, self.password)
        body = {'token': token, 'timestamp': timestamp, 'signature': signature, 'month': self.month}
        yield scrapy.FormRequest(self.start_formated_url, formdata=body, dont_filter=True)
    else:
        if self.method:
            yield scrapy.FormRequest(self.start_formated_url.format(month=self.month),
                                     method='GET', dont_filter=True)
        else:
            body = {'month': self.month}
            yield scrapy.FormRequest(self.start_formated_url, formdata=body, dont_filter=True)
def start_requests(self):
    if self.need_token:
        token = ''
        lines = read_cache('tokens', (self.plat_id or 'token') + '.tk')
        if self.need_token and lines:
            token = lines[0]
        timestamp = get_unix_time()
        signature = get_access_signature(token, timestamp, self.password)
        body = {'token': token, 'timestamp': timestamp, 'signature': signature,
                'from_date': self.from_date, 'to_date': self.to_date,
                'page_size': self.page_size, 'page_index': self.page_index}
        yield scrapy.FormRequest(self.start_formated_url, formdata=body, dont_filter=True)
    else:
        if self.method:
            yield scrapy.FormRequest(
                self.start_formated_url + self.formated_parameters.format(
                    page_size=self.page_size, page_index=self.page_index,
                    from_date=self.from_date, to_date=self.to_date),
                method='GET', dont_filter=True)
        else:
            body = {'from_date': self.from_date, 'to_date': self.to_date,
                    'page_size': self.page_size, 'page_index': self.page_index}
            yield scrapy.FormRequest(self.start_formated_url, formdata=body, dont_filter=True)
def start_requests(self):
    if self.need_token:
        token = ''
        lines = read_cache('tokens', (self.plat_id or 'token') + '.tk')
        if self.need_token and lines:
            token = lines[0]
        timestamp = get_unix_time()
        signature = get_access_signature(token, timestamp, self.password)
        for date in get_date_list(from_date=self.from_date, to_date=self.to_date, delimiter='-'):
            body = {'token': token, 'timestamp': timestamp, 'signature': signature, 'date': date}
            yield scrapy.FormRequest(self.start_formated_url, formdata=body, dont_filter=True)
    else:
        if self.method:
            for date in get_date_list(from_date=self.from_date, to_date=self.to_date, delimiter='-'):
                yield scrapy.FormRequest(self.start_formated_url.format(date=date),
                                         method='GET', dont_filter=True)
        else:
            for date in get_date_list(from_date=self.from_date, to_date=self.to_date, delimiter='-'):
                body = {'date': date}
                yield scrapy.FormRequest(self.start_formated_url, formdata=body, dont_filter=True)
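get_date_list is not shown either; the date-driven spiders above only need it to return one formatted string per day in the inclusive range. A minimal sketch under that assumption:

from datetime import datetime, timedelta

def get_date_list(from_date, to_date, delimiter='-'):
    # Hypothetical helper: list every date from from_date to to_date (inclusive),
    # formatted as year<delimiter>month<delimiter>day.
    fmt = delimiter.join(['%Y', '%m', '%d'])
    start = datetime.strptime(from_date, fmt)
    end = datetime.strptime(to_date, fmt)
    dates = []
    while start <= end:
        dates.append(start.strftime(fmt))
        start += timedelta(days=1)
    return dates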
def start_requests(self):
    if self.need_token:
        token = ''
        lines = read_cache('tokens', (self.plat_id or 'token') + '.tk')
        if self.need_token and lines:
            token = lines[0]
        timestamp = get_unix_time()
        signature = get_access_signature(token, timestamp, self.password, self.is_upper)
        body = {
            'token': token,
            'timestamp': timestamp,
            'signature': signature,
            'month': self.month
        }
        if self.is_json:
            yield Request(self.start_formated_url, body=json.dumps(body), method='POST')
        else:
            yield scrapy.FormRequest(self.start_formated_url, formdata=body, dont_filter=True)
    else:
        if self.method:
            yield scrapy.FormRequest(
                self.start_formated_url.format(month=self.month),
                method='GET', dont_filter=True)
        else:
            body = {'month': self.month}
            if self.is_json:
                yield Request(self.start_formated_url, body=json.dumps(body), method='POST')
            else:
                yield scrapy.FormRequest(self.start_formated_url, formdata=body, dont_filter=True)
def start_requests(self):
    if not (self.plat_id and self.time_from and self.time_to):
        return
    try:
        loans = JiekuanItem.django_model.objects.filter(plat_id=self.plat_id, status='1',
                                                        success_time__gte=self.time_from,
                                                        success_time__lte=self.time_to)
    except Exception as e:
        self.logger.info('Error From Filtering Loan Objects <%s>.' % e)
        return
    for jk in loans:
        if not jk.bid_id:
            continue
        token = ''
        lines = read_cache('tokens', (self.plat_id or 'test') + '.tk')
        if self.need_token and lines:
            token = lines[0]
        url = self.start_formated_url.format(id=jk.bid_id, token=token)
        from scrapy.http import Request
        cookies = parse_cookies(lines[1])
        yield Request(url, cookies=cookies, dont_filter=True)
def parse(self, response):
    # print "[url: %s || status: %s]" % (response.url, response.status)
    essay_urls = read_cache('news_cache', self.cache)
    driver = webdriver.PhantomJS()
    for essay_url in essay_urls:
        try:
            retitem = HujinzhentanExposureItem()
            driver.get(essay_url)
            # Wrap the fetched page source in an HtmlResponse just to reuse its XPath support.
            resp = HR("", 200, {}, driver.page_source.encode("utf8"))
            retitem["link"] = essay_url
            for i in essay_url.split("&"):
                if i.startswith("mid"):
                    retitem["thread_id"] = i.split("=")[1][-9:]
            retitem["abstract"] = ""
            retitem["title"] = resp.xpath("//h2[@class='rich_media_title']/text()").extract()[0].strip()
            retitem["date"] = resp.xpath("//em[@id='post-date']/text()").extract()[0]
            # Filter again by date here.
            allparts = resp.xpath("//div[@id='js_content']/*")
            # retitem["raw_html_content"] = allparts.extract()[1:-1]
            raw_paras = []
            paras = []
            source_para = ""
            img_urls = []
            for i in allparts:
                s = i.xpath("string(.)").extract()[0].strip()
                if len(s) > 0:
                    # Stop collecting paragraphs once the "source" line is reached.
                    if s.startswith(u"来源:"):
                        source_para = s
                        break
                    paras.append(s)
                    raw_paras.append(i.extract())
                if len(i.xpath(".//img/@data-src").extract()) > 0:
                    img_urls += i.xpath(".//img/@data-src").extract()
            retitem["content"] = "\n".join(paras)
            retitem["raw_html_content"] = "".join(raw_paras)
            try:
                retitem["source"] = source_para.split(u":")[1]
            except BaseException:
                retitem["source"] = u"互金侦探"
            retitem["category"] = "曝光"
            retitem["image_urls"] = "#".join(img_urls)
            yield retitem
        except BaseException:
            continue