def parse(self, response):
    """Parse a chapter-index page: insert one row per chapter link and
    schedule each chapter page for ``parse2``.

    Expects ``response.meta`` to carry ``xpath`` (link selector),
    ``bid``, ``other`` (flag controlling how the external id is built)
    and ``id`` (seeded by ``start_requests``).
    """
    mysql = msyqlHelper()
    old = response.meta
    # Titles to skip (the author's "shelving note" pseudo-chapter).
    names = set(['上架感言!'])
    links = response.xpath(old['xpath'])
    sequence = 1
    for link in links:
        name = link.xpath('text()').extract_first()
        if name in names:
            continue
        href = link.xpath('@href').extract_first()
        url = urljoin(response.url, href)
        names.add(name)  # also de-duplicates repeated chapter titles
        meta = {
            'name': name,
            'bid': old['bid'],
            'size': 0,
            'is_vip': 1,
            'prev_cid': 0,
            'next_cid': 0,
            'sequence': sequence,
        }
        sequence += 1
        self.logger.info('Parse url is %s', url)
        chapter_id = mysql.insert(meta)
        meta['chapter_id'] = chapter_id
        # Fix: truthiness test instead of "== True".
        if old['other']:
            # Derive a per-chapter external id from the href slug.
            meta['id'] = old['id'] + href.replace('.html', '')
        else:
            meta['id'] = old['id']
        self.logger.info('chapter_id is ------------------%s', chapter_id)
        yield scrapy.Request(url, callback=self.parse2, meta=meta)
    mysql.close()
def parse(self, response):
    """Walk the chapter list, persist each chapter row and schedule its
    content page for ``parse_content``.

    ``response.meta`` must provide ``linkpath`` (xpath to the chapter
    anchors), ``bid`` and ``contentxpath``.
    """
    db = msyqlHelper()
    seen_titles = {'上架感言!'}  # pseudo-chapters / duplicates to skip
    anchors = response.xpath(response.meta['linkpath'])
    seq = 1
    running_cid = 1
    for anchor in anchors:
        title = anchor.xpath('text()').extract_first()
        if title in seen_titles:
            continue
        href = anchor.xpath('@href').extract_first()
        chapter_url = urljoin(response.url, href)
        seen_titles.add(title)
        row = dict()
        row['name'] = title
        row['bid'] = response.meta['bid']
        row['size'] = 0
        row['is_vip'] = 1
        # prev/next are provisional counters, not real chapter ids.
        row['prev_cid'] = 0 if seq == 1 else running_cid - 1
        row['next_cid'] = running_cid + 1
        running_cid += 1
        row['sequence'] = seq
        seq += 1
        self.logger.info('Parse url is %s', chapter_url)
        inserted_id = db.insert(row)
        row['contentxpath'] = response.meta['contentxpath']
        row['id'] = inserted_id
        self.logger.info('Parse function called on dfsdfsd------------------')
        yield scrapy.Request(chapter_url, callback=self.parse_content, meta=row)
    db.close()
def parse(self, response):
    """Scrape one chapter content page, store it, then follow the
    "下一页" (next page) link with an incremented sequence number.

    Recurses into itself via the next-page link; stops when either the
    link is missing or a known final chapter title is reached.
    """
    mysql = msyqlHelper()
    data = dict()
    meta = dict()
    data['bid'] = response.meta['bid']
    data['size'] = 0
    data['is_vip'] = 0
    data['name'] = response.xpath(
        '//div[@id="htmltimu"]/h2/span/text()').extract_first()
    # Fix: don't shadow the builtin `str`; use a comprehension instead
    # of filter()+list().
    paragraphs = response.xpath('//div[@id="chapterContent"]/p/text()').extract()
    paragraphs = [p for p in paragraphs if p != '']
    content = '\r\n'.join(paragraphs)
    data['content'] = content
    data['size'] = len(content)
    data['sequence'] = response.meta['sequence']
    data['prev_cid'] = 0
    data['next_cid'] = 0
    mysql.inseraAll(data)  # return value was unused
    self.logger.info(data)
    mysql.close()
    # Hard stop at a known final chapter title.
    if data['name'] == '第一千零九十章少年的奇怪行为':
        return
    href = response.xpath(
        '//a[contains(.//text(), "下一页")]/@href').extract_first()
    if href is None:
        return
    meta['bid'] = response.meta['bid']
    meta['sequence'] = response.meta['sequence'] + 1
    meta['last_name'] = response.meta['last_name']
    next_url = urljoin(response.url, href)
    yield scrapy.Request(next_url, callback=self.parse, meta=meta)
def parse(self, response):
    """Parse a chapter index: insert each chapter row (including the
    content xpath forwarded from start_requests) and request the
    chapter body with ``parse2``.
    """
    mysql = msyqlHelper()
    old = response.meta
    # Pseudo-chapter titles to skip.
    names = set(['上架感言!'])
    links = response.xpath(old['xpath'])
    j = 1
    for link in links:
        name = link.xpath('text()').extract_first()
        if name in names:
            continue
        href = link.xpath('@href').extract_first()
        url = urljoin(response.url, href)
        names.add(name)  # de-duplicate repeated titles
        meta = {
            'name': name,
            'bid': old['bid'],
            'size': 0,
            'is_vip': 1,
            'prev_cid': 0,
            'next_cid': 0,
            'sequence': j,
            'contentxpath': old['contentxpath'],
        }
        j += 1
        self.logger.info('Parse url is %s', url)
        # The freshly inserted row id becomes the chapter's id.
        meta['id'] = mysql.insert(meta)
        self.logger.info('Parse function called on dfsdfsd------------------')
        yield scrapy.Request(url, callback=self.parse2, meta=meta)
    mysql.close()
def parse(self, response):
    """Parse the hgread.com chapter list (entries after position 86),
    insert each chapter row starting at sequence 300, and request the
    chapter body from the OBookApiAgent endpoint with a logged-in
    cookie jar. Stops after sequence 499 (200 chapters).
    """
    mysql = msyqlHelper()
    chapters = response.xpath('//ul[@class="t-list"]/li[position()>86]/a')
    i = 300  # resume point: sequences 1-299 were already stored
    # Session cookies captured from a logged-in browser session.
    cookies = {
        "Hm_lvt_b7a5349c0dc4d90da89e89cc58ee99da": 1523950381,
        "UserItem_HGREAD_7040": "%7b%22UserId%22%3a2699411%2c%22OpenId%22%3a%22951678%22%2c%22AccessToken%22%3a%229E8E5C106BF94A14BE3760E3A5D1483F%22%2c%22RefreshToken%22%3a%2255DB88B286834E26A4DC66776E770537%22%2c%22ExpiresIn%22%3a1200%2c%22NickName%22%3a%22jinmincc%22%2c%22QQNo%22%3a%22%22%2c%22EMail%22%3a%22%22%2c%22Gender%22%3anull%2c%22IntroSelf%22%3a%22%22%2c%22HeadImgUrl%22%3a%22%22%2c%22UDate%22%3a%22%5c%2fDate(-62135596800000)%5c%2f%22%2c%22NewGuid%22%3anull%2c%22SumAmount%22%3a978%2c%22GiveSumAmount%22%3a0%2c%22GiveLoseTime%22%3a%22%5c%2fDate(-62135596800000)%5c%2f%22%2c%22VipScore%22%3a0%2c%22VipLoseScore%22%3a0%2c%22VipGrowth%22%3a0%2c%22VipLoseTime%22%3a%22%5c%2fDate(-62135596800000)%5c%2f%22%2c%22SVipLoseTime%22%3a%22%5c%2fDate(-62135596800000)%5c%2f%22%2c%22IsVip%22%3afalse%2c%22IsSVip%22%3afalse%2c%22VipTag%22%3a%22VIP%22%2c%22ScoreLevel%22%3a0%2c%22GrowthLevel%22%3a0%2c%22VipDiscount%22%3a1.000%2c%22TpOpenId%22%3anull%2c%22TpExpiresTime%22%3a0%7d",
        "USER_INFO_ACCESS_TOKEN": "EcXxcJEdDjsPKbanpdVAApnGhUOfe2RFwP9PRlNa2owtfVn9CvkMDQ==",
        "USER_INFO_EXPIRES_IN": "BSQWOyR09yo=",
        "USER_INFO_REFRESH_TOKEN": "RzYHBqus6gbGAgWeBK9D2LGd0IDtcKpO5/2zUwFRI2daAc3oHqA1ww==",
        "Hm_lpvt_b7a5349c0dc4d90da89e89cc58ee99da": 1523950925
    }
    for chapter in chapters:
        link_info = chapter.xpath('@href').extract_first()
        name = chapter.xpath('text()').extract_first()
        # href shape assumed: /<x>/<bookId>/<chapterId>.html — the book
        # id is segment 2 and the chapter id is segment 3 minus ".html".
        link = link_info.split('/')
        headers = {
            "Referer": response.url,
            "Host": "www.hgread.com",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"
        }
        meta = {}
        meta['bid'] = 45
        meta['name'] = name
        meta['sequence'] = i
        meta['size'] = 0
        meta['is_vip'] = 0
        meta['prev_cid'] = 0
        meta['next_cid'] = 0
        chapter_id = mysql.insert(meta)
        meta['id'] = chapter_id
        url_t = 'http://www.hgread.com/home/OBookApiAgent?action=Chapter&bookId=%s&chapterId=%s' % (
            link[2], link[3][0:-5])
        yield scrapy.Request(url_t,
                             callback=self.parse_content,
                             meta=meta,
                             cookies=cookies,
                             headers=headers)
        # Fix: check the cut-off AFTER yielding so the last chapter
        # within the limit is not inserted into the DB without its
        # content request ever being scheduled (orphan row).
        i = i + 1
        if i >= 500:
            break
    mysql.close()
def start_requests(self):
    """Seed the crawl: create one book row per configured start entry
    and dispatch the book's index page to ``parse``.

    Each entry in ``self.start_urls`` is (book name, index url,
    chapter-link xpath, external source id).
    """
    helper = msyqlHelper()
    for entry in self.start_urls:
        book_id = helper.insertbook(entry[0])
        request_meta = {
            'bid': book_id,
            'xpath': entry[2],
            'id': entry[3],
        }
        yield scrapy.Request(entry[1], callback=self.parse, meta=request_meta)
    helper.close()
def start_requests(self):
    """Seed the crawl from preconfigured (bid, url, xpath, id) tuples.

    Unlike the ``insertbook`` variant of this method, the book id is
    already known, so no database work is needed here.

    Fix: removed a ``msyqlHelper`` connection that was opened and
    closed without ever being used.
    """
    for url in self.start_urls:
        link = url[1]
        meta = {
            'bid': url[0],
            'xpath': url[2],
            'id': url[3],
        }
        yield scrapy.Request(link, callback=self.parse, meta=meta)
def parse(self, response):
    """Chapter-index parser for www.vodtw.com: store each chapter row
    and schedule the chapter page for ``parse_content`` with
    site-specific headers.

    Fixes: collapsed an if/else whose branches both assigned
    ``prev_cid = 0``; removed the ``maxcid`` counter that was
    incremented but never read; removed dead commented-out code.
    """
    mysql = msyqlHelper()
    # Pseudo-chapter titles to skip. NOTE: nothing is added to this set
    # inside the loop (the add was commented out upstream), so repeated
    # titles are NOT de-duplicated here — this preserves that behavior.
    names = set(['上架感言!'])
    links = response.xpath(response.meta['linkpath'])
    j = 1
    for link in links:
        name = link.xpath('text()').extract_first()
        if name in names:
            continue
        href = link.xpath('@href').extract_first()
        next_url = urljoin(response.url, href)
        meta = dict()
        meta['name'] = name
        meta['bid'] = response.meta['bid']
        meta['size'] = 0
        meta['is_vip'] = 1
        meta['prev_cid'] = 0
        meta['next_cid'] = 0
        meta['sequence'] = j
        j = j + 1
        self.logger.info('Parse url is %s', next_url)
        chapter_id = mysql.insert(meta)
        meta['contentxpath'] = response.meta['contentxpath']
        meta['id'] = chapter_id
        self.logger.info('next url is %s------------------' % next_url)
        headers = {
            "Referer": response.url,
            "Host": "www.vodtw.com",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"
        }
        yield scrapy.Request(next_url,
                             callback=self.parse_content,
                             meta=meta,
                             headers=headers)
    mysql.close()
def parse(self, response):
    """Index parser with a resume point: walks chapter links until a
    known, already-crawled chapter title is reached, dispatching each
    page to ``parse2``. Database inserts are intentionally disabled in
    this variant — only requests are scheduled.
    """
    mysql = msyqlHelper()
    old = response.meta
    skip_titles = set(['上架感言!'])
    links = response.xpath(old['xpath'])
    self.logger.info(response.status)
    seq = 1
    for link in links:
        title = link.xpath('text()').extract_first()
        if title in skip_titles:
            continue
        href = link.xpath('@href').extract_first()
        url = urljoin(response.url, href)
        # Stop once the already-crawled portion of the book is reached.
        if title == '第四百三十五章 自作孽不可活 4':
            break
        skip_titles.add(title)
        meta = {
            'name': title,
            'bid': old['bid'],
            'size': 0,
            'is_vip': 1,
            'prev_cid': 0,
            'next_cid': 0,
            'sequence': seq,
        }
        seq += 1
        self.logger.info('Parse url is %s', url)
        meta['id'] = old['id']
        self.logger.info('bid is ------------------%s', meta['bid'])
        self.logger.info('name is -----name-------------%s', title)
        yield scrapy.Request(url, callback=self.parse2, meta=meta)
    mysql.close()
def parse(self, response):
    """Index parser for m.bsread.com: skips the first 10 chapter links
    (already stored), inserts the rest, and fetches each chapter body
    with a POST to the wx/s.php endpoint using a logged-in cookie jar.

    Fixes: removed the unused ``hrefArr`` local and dead ``maxcid``
    counter, collapsed an if/else whose branches both assigned
    ``prev_cid = 0``, and deleted large blocks of commented-out code.
    """
    mysql = msyqlHelper()
    # Pseudo-chapter titles to skip.
    names = set(['上架感言!'])
    links = response.xpath(response.meta['linkpath'])
    self.logger.info(links)
    # Session cookies captured from a logged-in browser session.
    cookies = {
        "UM_distinctid": "162a3e6a64e89-0843b576687bb1-2e06372c-51000-162a3e6a6511a9",
        "uvip": "faaf79da50e39d893598fd8fce28cc04",
        "wgid": "1",
        "tlid": "223",
        "qdi": "1395",
        "qp": "30025",
        "id": "495059",
        "name": "user495059",
        "names": "%22user495059%22",
        "contact": "%22%5Cu91d1%5Cu5999%5Cu5999%22",
        "pic": "495059.jpg",
        "v": "1",
        "code": "9c960aa1d4b92cfa9569f26edc2cf2aa",
        "phone_unbind": "1",
        "tuid": "30025",
        "PHPSESSID": "gaa04b81clg2qj6n3qoeavmp52",
        "pindao": "b",
        "bi": "204",
        "CNZZDATA1267452641": "772678074-1523166968-%7C1524820323",
        "Hm_lvt_589e8b9ebda178159870e84dcda2b999": "1524801203",
        "Hm_lpvt_589e8b9ebda178159870e84dcda2b999": "1524821433"
    }
    j = 1
    for link in links:
        name = link.xpath('text()').extract_first()
        if name in names:
            continue
        href = link.xpath('@href').extract_first()
        next_url = urljoin(response.url, href)
        self.logger.info(name)
        meta = dict()
        meta['name'] = name
        meta['bid'] = response.meta['bid']
        meta['size'] = 0
        meta['is_vip'] = 1
        meta['prev_cid'] = 0
        meta['next_cid'] = 0
        self.logger.info(
            "-------enter-222-------------enter---2222-----------enter---222----"
        )
        meta['sequence'] = j
        j = j + 1
        self.logger.info('----name-is:%s---url-is:%s' % (name, next_url))
        headers = {
            "Referer": response.url,
            "Host": "m.bsread.com",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
        }
        self.logger.info('Parse url is %s', next_url)
        # Resume point: the first 10 non-skipped links are already in
        # the database (j has just been incremented, so j <= 11 covers
        # the first 10 iterations).
        if j <= 11:
            continue
        chapter_id = mysql.insert(meta)
        meta['contentxpath'] = response.meta['contentxpath']
        meta['id'] = chapter_id
        self.logger.info(
            'Parse function called on dfsdfsd------------------')
        # Chapter id = second path segment minus its ".html" suffix.
        aid = href.split('/')[2][0:-5]
        formdata = {"act": "gView", "bid": '224', "aid": aid}
        self.logger.info(formdata)
        yield scrapy.FormRequest(url='https://m.bsread.com/wx/s.php',
                                 formdata=formdata,
                                 headers=headers,
                                 callback=self.parse_content2,
                                 cookies=cookies,
                                 meta=meta)
    mysql.close()
def parse(self, response):
    """Index parser for yomeng.yunshuge.com: inserts the chapter row
    (sequence numbering resumes at 317) and requests the chapter body
    with a logged-in cookie jar. Only the FIRST link is processed per
    run — the trailing ``break`` is deliberate.

    Fixes: removed the unused ``names`` set (it was never consulted),
    the unused ``hrefArr`` local, the dead ``maxcid`` counter, an
    if/else whose branches both assigned ``prev_cid = 0``, and dead
    commented-out code.
    """
    mysql = msyqlHelper()
    links = response.xpath(response.meta['linkpath'])
    self.logger.info(links)
    # Session cookies captured from a logged-in browser session.
    cookies = {
        "jieqiVisitInfo": "jieqiUserLogin%3D1525764839%2CjieqiUserId%3D123189",
        "read_pagenum": "1",
        "jieqiWapPsize": "-11",
        "shuhai_history_": "%5B%7B%22aid%22%3A%2211817%22%2C%22cid%22%3A1230652%2C%22aname%22%3A%22%25CE%25D2%25C4%25C3%25CA%25B1%25B9%25E2%25BB%25BB%25C4%25E3%25D2%25BB%25CA%25C0%25B3%25D5%25C3%25D4%22%2C%22autname%22%3A%22%25CA%25A2%25C9%25D9%22%2C%22asort%22%3A%22%25CF%25D6%25B4%25FA%25D1%25D4%25C7%25E9%22%2C%22cname%22%3A%22%2B%25B5%25DA5%25D5%25C2%2B%25D7%25ED%25BE%25C6%25B6%25D4%25BF%25B9%22%2C%22siteid%22%3Anull%2C%22sortid%22%3A%22111%22%7D%2C%7B%22aid%22%3A%2213540%22%2C%22cid%22%3A2053135%2C%22aname%22%3A%22%25CE%25AA%25C4%25E3%25C4%25A8%25C8%25A5%25D2%25BB%25CA%25C0%25B3%25BE%25B0%25A3%22%2C%22autname%22%3A%22%25BE%25FD%25D6%25B9%25B9%25E9%22%2C%22asort%22%3A%22%25C7%25E0%25B4%25BA%25D0%25A3%25D4%25B0%22%2C%22cname%22%3A%22%2B%25B5%25DA050%25D5%25C2%2526nbsp%253B%2526nbsp%253B%25CB%25AF%25BE%25F5%25CA%25C7%25B8%25F6%25CE%25CA%25CC%25E2%22%2C%22siteid%22%3Anull%2C%22sortid%22%3A%22101%22%7D%5D",
        "PHPSESSID": "1ff197c95d71a9d38021cdf0ccff1508"
    }
    j = 317  # resume point: sequences 1-316 were already stored
    for link in links:
        name = link.xpath('text()').extract_first()
        href = link.xpath('@href').extract_first()
        next_url = urljoin(response.url, href)
        self.logger.info(name)
        meta = dict()
        meta['name'] = name
        meta['bid'] = response.meta['bid']
        meta['size'] = 0
        meta['is_vip'] = 1
        meta['prev_cid'] = 0
        meta['next_cid'] = 0
        self.logger.info(
            "-------enter-222-------------enter---2222-----------enter---222----"
        )
        meta['sequence'] = j
        j = j + 1
        self.logger.info('----name-is:%s---url-is:%s' % (name, next_url))
        headers = {
            "Referer": response.url,
            "Host": "yomeng.yunshuge.com",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
        }
        self.logger.info('Parse url is %s', next_url)
        chapter_id = mysql.insert(meta)
        meta['contentxpath'] = response.meta['contentxpath']
        meta['id'] = chapter_id
        self.logger.info(
            'Parse function called on dfsdfsd------------------')
        yield scrapy.Request(next_url,
                             callback=self.parse_content,
                             meta=meta,
                             cookies=cookies,
                             headers=headers)
        # Deliberately stop after the first chapter.
        break
    mysql.close()