Example no. 1
    def start_requests(self):
        if self.cache:
            self.logger.info('Loading Exposure URLs From File %s.' % self.cache)
            self.start_urls = read_cache('cache', self.cache)

        for url in self.start_urls:
            yield self.make_requests_from_url(url)
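
Every example on this page calls a read_cache(dir, name) helper that is defined elsewhere in the project and is not shown here. A minimal sketch of what it plausibly does, assuming the cache is a plain text file with one entry per line stored under a directory named by the first argument, is:

import os

def read_cache(cache_dir, filename):
    # Hypothetical sketch: return the non-empty lines of a cache file,
    # or an empty list if the file does not exist.
    path = os.path.join(cache_dir, filename)
    if not os.path.exists(path):
        return []
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]
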
Example no. 2
    def start_requests(self):
        if self.cache:
            self.logger.info('Loading New URLs From File %s.' % self.cache)
            self.start_urls = read_cache('cache', self.cache)

        #super(XinwenSpider, self).start_requests()
        for url in self.start_urls:
            yield self.make_requests_from_url(url)
Example no. 3
    def __init__(self, cache='news_cache', from_date=news.settings.START_DATE, to_date="2099-12-31", **kwargs):
        self.logger.debug(locals())
        self.cache = cache + ".ch"
        self.from_date = from_date
        self.to_date = to_date
        self.__dict__.update(kwargs)  # important: extra spider arguments override instance attributes

        self.start_urls = read_cache('news_cache', self.cache)
Example no. 4
    def start_requests(self):
        if self.cache:
            self.logger.info("Loading New URLs From File %s." % self.cache)
            self.start_urls = read_cache("cache", self.cache)

        # super(XinwenSpider, self).start_requests()
        for url in self.start_urls:
            yield self.make_requests_from_url(url)
Example no. 5
    def start_requests(self):
        token = ''
        lines = read_cache('tokens', (self.plat_id or 'test')+'.tk')
        if self.need_token and lines: token = lines[0]
        url = self.start_formated_url.format(token=token)

        from scrapy.http import Request
        cookies = parse_cookies(lines[1])
        yield Request(url, cookies=cookies)
Example no. 6
    def start_requests(self):
        token = ''
        lines = read_cache('tokens', (self.plat_id or 'test') + '.tk')
        if self.need_token and lines: token = lines[0]
        url = self.start_formated_url.format(token=token)

        from scrapy.http import Request
        cookies = parse_cookies(lines[1])
        yield Request(url, cookies=cookies)
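
Examples no. 5 and no. 6 also rely on a parse_cookies helper that turns the second line of the token cache into the cookies dict passed to Request. Its implementation is not included on this page; a plausible sketch, assuming that line is an ordinary "name=value; name2=value2" cookie string, would be:

def parse_cookies(cookie_line):
    # Hypothetical sketch: convert a "name=value; name2=value2" string into
    # the dict that scrapy.http.Request(cookies=...) expects.
    cookies = {}
    for pair in cookie_line.split(';'):
        if '=' in pair:
            name, _, value = pair.strip().partition('=')
            cookies[name] = value
    return cookies
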
Example no. 7
    def start_requests(self):
        for i in self.shortlist:
            token = ''
            lines = read_cache('tokens', (self.plat_id or 'test') + '.tk')

            if self.need_token and lines: token = lines[0]
            url = self.start_formated_url.format(token=token, page_index=i)

            from scrapy.http import Request
            cookies = parse_cookies(lines[1])
            yield Request(url, cookies=cookies, dont_filter=True)
Example no. 8
    def start_requests(self):
        for i in self.shortlist:
            token = ''
            lines = read_cache('tokens', (self.plat_id or 'test')+'.tk')

            if self.need_token and lines: token = lines[0]
            url = self.start_formated_url.format(token=token, page_index=i)

            from scrapy.http import Request
            cookies = parse_cookies(lines[1])
            yield Request(url, cookies=cookies, dont_filter=True)
Example no. 9
    def __init__(self, plat_id=None, plat_name=None, need_token='0', formated_url=None, cache='cache', \
                 *args, **kwargs):
        self.plat_id = plat_id
        self.plat_name = plat_name
        self.need_token = bool(int(need_token))
        self.start_formated_url = formated_url

        lines, total_page = read_cache('cache', cache+'.ch'), 0
        if lines: total_page = int(lines[0])
        self.shortlist = xrange(1, total_page+1)
        super(YuqiSpider, self).__init__(*args, **kwargs)
Example no. 10
    def __init__(self, plat_id=None, plat_name=None, need_token='0', formated_url=None, cache='cache', \
                 *args, **kwargs):
        self.plat_id = plat_id
        self.plat_name = plat_name
        self.need_token = bool(int(need_token))
        self.start_formated_url = formated_url

        lines, total_page = read_cache('cache', cache + '.ch'), 0
        if lines: total_page = int(lines[0])
        self.shortlist = xrange(1, total_page + 1)
        super(YuqiSpider, self).__init__(*args, **kwargs)
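
YuqiSpider's constructor (Examples no. 9 and no. 10) receives plat_id, need_token and the other values as Scrapy spider arguments; the use of xrange marks this as Python 2 code. A hypothetical usage sketch, in which the import path and the argument values are assumptions rather than taken from the project, could look like:

# Hypothetical driver script: Scrapy forwards keyword arguments given to
# crawl() (or passed on the command line with -a) to the spider's __init__.
from scrapy.crawler import CrawlerProcess
from yuqi.spiders import YuqiSpider  # assumed import path

process = CrawlerProcess()
process.crawl(YuqiSpider,
              plat_id='demo',  # assumed value
              need_token='1',
              formated_url='http://example.com/list?page_index={page_index}&token={token}',
              cache='demo_cache')
process.start()
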
Example no. 11
    def start_requests(self):
        if self.need_token:
            token = ''
            lines = read_cache('tokens', (self.plat_id or 'token') + '.tk')

            if self.need_token and lines: token = lines[0]

            timestamp = get_unix_time()
            signature = get_access_signature(token, timestamp, self.password,
                                             self.is_upper)

            body = {
                'token': token,
                'timestamp': timestamp,
                'signature': signature,
                'from_date': self.from_date,
                'to_date': self.to_date,
                'page_size': self.page_size,
                'page_index': self.page_index
            }
            if self.is_json:
                yield Request(self.start_formated_url,
                              body=json.dumps(body),
                              method='POST')
            else:
                yield scrapy.FormRequest(self.start_formated_url,
                                         formdata=body,
                                         dont_filter=True)
        else:
            if self.method:
                yield scrapy.FormRequest(
                    self.start_formated_url +
                    self.formated_parameters.format(page_size=self.page_size,
                                                    page_index=self.page_index,
                                                    from_date=self.from_date,
                                                    to_date=self.to_date),
                    method='GET',
                    dont_filter=True)
            else:
                body = {
                    'from_date': self.from_date,
                    'to_date': self.to_date,
                    'page_size': self.page_size,
                    'page_index': self.page_index
                }
                if self.is_json:
                    yield Request(self.start_formated_url,
                                  body=json.dumps(body),
                                  method='POST')
                else:
                    yield scrapy.FormRequest(self.start_formated_url,
                                             formdata=body,
                                             dont_filter=True)
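
Examples no. 11 through no. 16 sign their requests with get_unix_time and get_access_signature, neither of which appears on this page. The sketch below only illustrates the call shape, assuming a simple MD5-over-concatenation scheme; the real signing algorithm may well differ:

import time
import hashlib

def get_unix_time():
    # current Unix timestamp as a string
    return str(int(time.time()))

def get_access_signature(token, timestamp, password, is_upper=False):
    # Hypothetical sketch only: sign the request as an MD5 digest of
    # token + timestamp + password, upper-cased when is_upper is set.
    digest = hashlib.md5((token + timestamp + password).encode('utf-8')).hexdigest()
    return digest.upper() if is_upper else digest
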
Example no. 12
    def start_requests(self):
        if self.need_token:
            token = ''
            lines = read_cache('tokens', (self.plat_id or 'token') + '.tk')

            if self.need_token and lines: token = lines[0]

            timestamp = get_unix_time()
            signature = get_access_signature(token, timestamp, self.password,
                                             self.is_upper)

            for date in get_date_list(from_date=self.from_date,
                                      to_date=self.to_date,
                                      delimiter='-'):
                body = {
                    'token': token,
                    'timestamp': timestamp,
                    'signature': signature,
                    'date': date
                }
                if self.is_json:
                    yield Request(self.start_formated_url,
                                  body=json.dumps(body),
                                  method='POST')
                else:
                    yield scrapy.FormRequest(self.start_formated_url,
                                             formdata=body,
                                             dont_filter=True)
        else:
            if self.method:
                for date in get_date_list(from_date=self.from_date,
                                          to_date=self.to_date,
                                          delimiter='-'):
                    yield scrapy.FormRequest(
                        self.start_formated_url.format(date=date),
                        method='GET',
                        dont_filter=True)
            else:
                for date in get_date_list(from_date=self.from_date,
                                          to_date=self.to_date,
                                          delimiter='-'):
                    body = {'date': date}
                    if self.is_json:
                        yield Request(self.start_formated_url,
                                      body=json.dumps(body),
                                      method='POST')
                    else:
                        yield scrapy.FormRequest(self.start_formated_url,
                                                 formdata=body,
                                                 dont_filter=True)
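
get_date_list, used in Examples no. 12 and no. 15, is likewise not shown. A plausible sketch, assuming from_date and to_date are strings already formatted with the same delimiter (e.g. '2016-01-01'), is:

from datetime import datetime, timedelta

def get_date_list(from_date, to_date, delimiter='-'):
    # Hypothetical sketch: return every date from from_date to to_date
    # (inclusive), formatted as YYYY<delimiter>MM<delimiter>DD strings.
    fmt = delimiter.join(['%Y', '%m', '%d'])
    start = datetime.strptime(from_date, fmt)
    end = datetime.strptime(to_date, fmt)
    dates = []
    while start <= end:
        dates.append(start.strftime(fmt))
        start += timedelta(days=1)
    return dates
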
Example no. 13
    def start_requests(self):
        if self.need_token:
            token = ''
            lines = read_cache('tokens', (self.plat_id or 'token')+'.tk')
            if self.need_token and lines:
                token = lines[0]

            timestamp = get_unix_time()
            signature = get_access_signature(token, timestamp, self.password)
            body = {'token': token, 'timestamp': timestamp, 'signature': signature, 'month': self.month}

            yield scrapy.FormRequest(self.start_formated_url, formdata=body, dont_filter=True)
        else:
            if self.method:
                yield scrapy.FormRequest(self.start_formated_url.format(month=self.month), method='GET', dont_filter=True)
            else:
                body = {'month': self.month}
                yield scrapy.FormRequest(self.start_formated_url, formdata=body, dont_filter=True)
Example no. 14
    def start_requests(self):
        if self.need_token:
            token = ''
            lines = read_cache('tokens', (self.plat_id or 'token')+'.tk')

            if self.need_token and lines: token = lines[0]

            timestamp = get_unix_time()
            signature = get_access_signature(token, timestamp, self.password)

            body = {'token': token, 'timestamp': timestamp, 'signature': signature,
                    'from_date': self.from_date, 'to_date': self.to_date,
                    'page_size': self.page_size, 'page_index': self.page_index}

            yield scrapy.FormRequest(self.start_formated_url, formdata=body, dont_filter=True)
        else:
            if self.method:
                yield scrapy.FormRequest(
                    self.start_formated_url + self.formated_parameters.format(
                        page_size=self.page_size, page_index=self.page_index,
                        from_date=self.from_date, to_date=self.to_date),
                    method='GET', dont_filter=True)
            else:
                body = {'from_date': self.from_date, 'to_date': self.to_date,
                        'page_size': self.page_size, 'page_index': self.page_index}
                yield scrapy.FormRequest(self.start_formated_url, formdata=body, dont_filter=True)
Example no. 15
    def start_requests(self):
        if self.need_token:
            token = ''
            lines = read_cache('tokens', (self.plat_id or 'token')+'.tk')

            if self.need_token and lines: token = lines[0]

            timestamp = get_unix_time()
            signature = get_access_signature(token, timestamp, self.password)

            for date in get_date_list(from_date=self.from_date, to_date=self.to_date, delimiter='-'):
                body = {'token': token, 'timestamp': timestamp, 'signature': signature, 'date': date}
                yield scrapy.FormRequest(self.start_formated_url, formdata=body, dont_filter=True)
        else:
            if self.method:
                for date in get_date_list(from_date=self.from_date, to_date=self.to_date, delimiter='-'):
                    yield scrapy.FormRequest(self.start_formated_url.format(date=date), method='GET', dont_filter=True)
            else:
                for date in get_date_list(from_date=self.from_date, to_date=self.to_date, delimiter='-'):
                    body = {'date': date}
                    yield scrapy.FormRequest(self.start_formated_url, formdata=body, dont_filter=True)
Example no. 16
    def start_requests(self):
        if self.need_token:
            token = ''
            lines = read_cache('tokens', (self.plat_id or 'token') + '.tk')
            if self.need_token and lines:
                token = lines[0]

            timestamp = get_unix_time()
            signature = get_access_signature(token, timestamp, self.password,
                                             self.is_upper)
            body = {
                'token': token,
                'timestamp': timestamp,
                'signature': signature,
                'month': self.month
            }
            if self.is_json:
                yield Request(self.start_formated_url,
                              body=json.dumps(body),
                              method='POST')
            else:
                yield scrapy.FormRequest(self.start_formated_url,
                                         formdata=body,
                                         dont_filter=True)
        else:
            if self.method:
                yield scrapy.FormRequest(
                    self.start_formated_url.format(month=self.month),
                    method='GET',
                    dont_filter=True)
            else:
                body = {'month': self.month}
                if self.is_json:
                    yield Request(self.start_formated_url,
                                  body=json.dumps(body),
                                  method='POST')
                else:
                    yield scrapy.FormRequest(self.start_formated_url,
                                             formdata=body,
                                             dont_filter=True)
Example no. 17
    def start_requests(self):
        if not (self.plat_id and self.time_from and self.time_to): return

        try:
            loans = JiekuanItem.django_model.objects.filter(plat_id=self.plat_id, status='1', \
                                                            success_time__gte=self.time_from, \
                                                            success_time__lte=self.time_to)
        except Exception as e:
            self.logger.info('Error From Filtering Loan Objects <%s>.' % e)
            return

        for jk in loans:
            if not jk.bid_id: continue
            token = ''
            lines = read_cache('tokens', (self.plat_id or 'test')+'.tk')

            if self.need_token and lines: token = lines[0]
            url = self.start_formated_url.format(id=jk.bid_id, token=token)

            from scrapy.http import Request
            cookies = parse_cookies(lines[1])
            yield Request(url, cookies=cookies, dont_filter=True)
Example no. 18
    def start_requests(self):
        if not (self.plat_id and self.time_from and self.time_to): return

        try:
            loans = JiekuanItem.django_model.objects.filter(plat_id=self.plat_id, status='1', \
                                                            success_time__gte=self.time_from, \
                                                            success_time__lte=self.time_to)
        except Exception as e:
            self.logger.info('Error From Filtering Loan Objects <%s>.' % e)
            return

        for jk in loans:
            if not jk.bid_id: continue
            token = ''
            lines = read_cache('tokens', (self.plat_id or 'test') + '.tk')

            if self.need_token and lines: token = lines[0]
            url = self.start_formated_url.format(id=jk.bid_id, token=token)

            from scrapy.http import Request
            cookies = parse_cookies(lines[1])
            yield Request(url, cookies=cookies, dont_filter=True)
Example no. 19
    def parse(self, response):
        # print "[url: %s || status: %s]" % (response.url, response.status)
        essay_urls = read_cache('news_cache', self.cache)
        driver = webdriver.PhantomJS()

        for essay_url in essay_urls:
            try:
                retitem = HujinzhentanExposureItem()

                driver.get(essay_url)
                # wrap the rendered page in an HtmlResponse (aliased as HR) purely to reuse its XPath helpers
                resp = HR("", 200, {}, driver.page_source.encode("utf8"))

                retitem["link"] = essay_url

                # the "mid" query parameter carries the thread id; keep its last 9 digits
                for i in essay_url.split("&"):
                    if i.startswith("mid"):
                        retitem["thread_id"] = i.split("=")[1][-9:]

                retitem["abstract"] = ""
                retitem["title"] = resp.xpath("//h2[@class='rich_media_title']/text()").extract()[0].strip()
                retitem["date"] = resp.xpath("//em[@id='post-date']/text()").extract()[0]

                # the date could be used for additional filtering here
                allparts = resp.xpath("//div[@id='js_content']/*")

                # retitem["raw_html_content"] = allparts.extract()[1:-1]

                raw_paras = []
                paras = []
                source_para = ""
                img_urls = []

                for i in allparts:
                    s = i.xpath("string(.)").extract()[0].strip()
                    if len(s) > 0:
                        if s.startswith(u"来源:"):
                            source_para = s
                            break
                        paras.append(s)

                    raw_paras.append(i.extract())

                    img_srcs = i.xpath(".//img/@data-src").extract()
                    if img_srcs:
                        img_urls += img_srcs

                retitem["content"] = "\n".join(paras)
                retitem["raw_html_content"] = "".join(raw_paras)

                try:
                    retitem["source"] = source_para.split(u":")[1]
                except BaseException:
                    retitem["source"] = u"互金侦探"

                retitem["category"] = "曝光"
                retitem["image_urls"] = "#".join(img_urls)

                yield retitem
            except BaseException:
                # skip articles that fail to load or parse
                continue
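
Example no. 19 fills a HujinzhentanExposureItem whose definition is not included on this page (and HR is, per the original comment, scrapy.http.HtmlResponse imported under a short alias). A field-for-field sketch inferred from the assignments above would be:

import scrapy

class HujinzhentanExposureItem(scrapy.Item):
    # Sketch inferred from the fields populated in Example no. 19;
    # the real item definition may contain more fields.
    link = scrapy.Field()
    thread_id = scrapy.Field()
    abstract = scrapy.Field()
    title = scrapy.Field()
    date = scrapy.Field()
    content = scrapy.Field()
    raw_html_content = scrapy.Field()
    source = scrapy.Field()
    category = scrapy.Field()
    image_urls = scrapy.Field()
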