def parse(self, response):
    """Parse a Youku-style listing page.

    Yields one classify-Request per video card (the partially-filled
    VideocrawlItem travels in request meta), then follows the "next"
    pagination link relative to the first start URL.
    """
    res_sel = Selector(response)
    for sel in res_sel.xpath('//div[@class="v va"]'):
        item = VideocrawlItem()
        item['title'] = sel.xpath(
            'div[@class="v-meta"]/div/a/text()').extract()
        item['desc'] = sel.xpath(
            'div[@class="v-meta"]/div/a/text()').extract()
        item['video_url'] = sel.xpath(
            'div[@class="v-meta"]/div/a/@href').extract()
        # BUG FIX: item['video_url'][0] below raised IndexError when the
        # card had no link; skip such cards instead.
        if not item['video_url']:
            continue
        item['date'] = sel.xpath(
            'div/div/span[@class="v-publishtime"]/text()').extract()
        # Guard: normalise the publish time only when one was scraped.
        if item['date']:
            item['date'][0] = date_cal(item['date'][0])
        item['duration'] = sel.xpath(
            'div/div/span[@class="v-time"]/text()').extract()
        item['classify'] = "01"
        yield Request(url=str(item['video_url'][0]),
                      meta={'item': item},
                      callback=self.parse_classify)
    # Follow the "next page" link (relative href joined to the start URL).
    links = response.xpath('//li[@class="next"]')
    for every_url in links:
        href = every_url.xpath('a/@href').extract()
        # Guard: extract()[0] on an empty list raised IndexError.
        if not href:
            continue
        url = str(self.start_urls[0] + href[0])
        yield Request(url, callback=self.parse)
def parse(self, response):
    """Scrape a lazily-loaded listing with PhantomJS.

    Scrolls to the bottom of each page so deferred entries render, yields
    one VideocrawlItem per entry, and clicks "next page" until the button
    disappears.
    """
    driver = webdriver.PhantomJS()
    driver.get(self.start_urls[0])
    while True:
        # Scroll to the bottom so lazily-loaded entries are rendered.
        js = "var q=document.documentElement.scrollTop=10000"
        driver.execute_script(js)
        time.sleep(3)
        # Wait until at least one result paragraph is present.
        WebDriverWait(driver, 20).until(
            lambda x: x.find_element_by_xpath('//p[@class="mB5 lh14"]'))
        sell = etree.HTML(driver.page_source)
        titles = sell.xpath('//li[@class="l pl27 pb15 dr_li"]')
        for node in titles:
            item = VideocrawlItem()
            item['title'] = node.xpath('p[1]/a/text()')
            item['desc'] = node.xpath('p[1]/a/text()')
            item['video_url'] = node.xpath('p[1]/a/@href')
            item['date'] = node.xpath('p[2]/a[2]/span/text()')
            # BUG FIX: item['date'][0] raised IndexError when the entry
            # carried no date; normalise only when present.
            if item['date']:
                item['date'][0] = date_cal(item['date'][0])
            item['duration'] = "null"
            item['classify'] = u'数码'
            yield item
        try:
            next_page = driver.find_element_by_xpath('//a[@class="a btnR"]')
            next_page.click()
            time.sleep(3)
        # Narrowed from a bare except: (which also swallowed SystemExit /
        # KeyboardInterrupt) — a missing button means pagination is done.
        except Exception:
            print("next_page is over")
            break
def parse(self, response):
    """Scrape a variety-show picture listing with PhantomJS.

    Yields one VideocrawlItem per entry and pages via the "down" arrow
    until no next page exists.
    """
    driver = webdriver.PhantomJS()
    driver.get(self.start_urls[0])
    while True:
        selector = etree.HTML(driver.page_source)
        # BUG FIX: the original `while index < len(sel)` loop executed
        # `continue` without incrementing `index` when video_url was empty,
        # spinning forever on that entry. A for-loop cannot get stuck.
        for node in selector.xpath('//div[@class="site-piclist_pic"]'):
            item = VideocrawlItem()
            item['title'] = node.xpath('a/@title')
            item['desc'] = node.xpath('a/@title')
            item['video_url'] = node.xpath('a/@href')
            if not item['video_url']:
                continue
            item['date'] = "null"
            item['duration'] = node.xpath('a/div/div/span/text()')
            item['classify'] = u'综艺'
            yield item
        try:
            next_page = driver.find_element_by_xpath('//a[@data-key="down"]')
            next_page.click()
            time.sleep(5)
        # Narrowed from a bare except: — missing button ends pagination.
        except Exception:
            print("next_page is over")
            break
def parse(self, response):
    """Scrape a Youku variety listing with PhantomJS (single page).

    Scrolls to the bottom so lazily-loaded cards render, then yields one
    VideocrawlItem per card.
    """
    time.sleep(5)
    driver = webdriver.PhantomJS()
    driver.maximize_window()
    driver.get(self.start_urls[0])
    # BUG FIX: `titles` was unbound (NameError) when the try block failed.
    titles = []
    try:
        js = "var q=document.documentElement.scrollTop=10000"
        driver.execute_script(js)
        time.sleep(10)
        sell = etree.HTML(driver.page_source)
        titles = sell.xpath('//div[@class="yk-col4 yk-pack p-list mb16"]')
    # Narrowed from a bare except: — keep the original best-effort behavior.
    except Exception:
        print("scoll errot")
    # BUG FIX: the original `while index < len(titles)` loop executed
    # `continue` without incrementing `index` when video_url was empty,
    # spinning forever on that card. A for-loop cannot get stuck.
    for node in titles:
        item = VideocrawlItem()
        item['title'] = node.xpath('div/a/@title')
        item['desc'] = node.xpath('div/a/@title')
        item['video_url'] = node.xpath('div/a/@href')
        if not item['video_url']:
            continue
        item['date'] = "null"
        item['duration'] = node.xpath('ul/li/span/span/text()')
        item['classify'] = u'综艺'
        yield item
def parse(self, response):
    """Page through a variety-show listing with PhantomJS.

    Yields one VideocrawlItem per entry on each page and clicks the "a1"
    next-page control until it disappears.
    """
    driver = webdriver.PhantomJS()
    driver.maximize_window()
    driver.get(self.start_urls[0])
    while True:
        time.sleep(5)
        # BUG FIX: the original parsed the *initial* Scrapy response on
        # every iteration, so clicking "next page" in the driver never
        # changed what was scraped (every page yielded page-1 items).
        # Re-parse the driver's current page source each time instead.
        sel_res = Selector(text=driver.page_source)
        for sel in sel_res.xpath('//li[@j-delegate="colitem"]'):
            item = VideocrawlItem()
            item['title'] = sel.xpath('div[1]/a/@data-title').extract()
            item['desc'] = sel.xpath('div[1]/a/@data-title').extract()
            item['video_url'] = sel.xpath('div[1]/a/@href').extract()
            if not item['video_url']:
                continue
            item['date'] = sel.xpath(
                'div[2]/p[2]/span[2]/text()').extract()
            # Guard: item['date'][0] raised IndexError when no date present.
            if item['date']:
                item['date'][0] = date_cal(item['date'][0])
            item['duration'] = sel.xpath(
                'div[1]/a/div/div/span/text()').extract()
            item['classify'] = u'综艺'
            yield item
        try:
            next_page = driver.find_element_by_xpath('//a[@class="a1"]')
            next_page.click()
        # Narrowed from a bare except: — missing button ends pagination.
        except Exception:
            print("next page is over")
            break
def parse(self, response):
    """Search soku.com for self.keys and yield one item per result.

    Scrolls each result page bottom -> top -> bottom so lazily-loaded
    results render, skips near-duplicate titles via is_similar, and pages
    through the "next" link until it disappears.
    """
    try:
        driver = webdriver.PhantomJS()
    except Exception:
        # BUG FIX: the original printed and fell through, then crashed
        # with NameError on the unbound `driver`; propagate the real error.
        print("driver error")
        raise
    driver.maximize_window()
    driver.get("http://www.soku.com/search_video/q_qww?f=1&kb=040200000000000__qww&")
    time.sleep(2)
    # Type the real keyword into the search box and submit.
    driver.find_element_by_id("headq").clear()
    driver.find_element_by_xpath('//input[@id="headq"]').send_keys(self.keys)
    driver.find_element_by_xpath('//button[@class="btn btn_search"]').click()
    time.sleep(3)
    while True:
        # Scroll bottom -> top -> bottom to trigger lazy loading.
        for anchor in ('//div[@class="about"]',
                       '//div[@class="sk_wrap"]',
                       '//div[@class="about"]'):
            target = driver.find_element_by_xpath(anchor)
            driver.execute_script("arguments[0].scrollIntoView();", target)
            time.sleep(3)
        selector = etree.HTML(driver.page_source)
        for node in selector.xpath('//div[@class="v"]'):
            item = VideocrawlItem()
            item['title'] = node.xpath('.//div[@class="v-link"]/a/@title')
            # Skip untitled results and near-duplicates.
            if not item['title'] or is_similar(item['title'][0]):
                continue
            item['desc'] = node.xpath('.//div[@class="v-link"]/a/@title')
            item['video_url'] = node.xpath('div[@class="v-link"]/a/@href')
            item['date'] = node.xpath('.//span[@class="r"]/text()')
            # BUG FIX: item['date'][0] raised IndexError when no date present.
            if item['date']:
                item['date'][0] = date_cal(item['date'][0])
            item['duration'] = node.xpath(
                'div/div/span[@class="v-time"]/text()')
            item['classify'] = 'null'
            if not item['video_url']:
                continue
            url = str(item['video_url'][0])
            item['classify'] = self.classify(url)
            yield item
        try:
            next_page = driver.find_element_by_xpath('//li[@class="next"]/a')
            next_page.click()
            time.sleep(3)
        # Narrowed from a bare except:; also fixed the "nexr" typo.
        except Exception:
            print("next page is over")
            break
def parse(self, response):
    """Search iQiyi for self.keys and yield one item per result row.

    Scrolls each result page bottom -> top -> bottom to trigger lazy
    loading, skips near-duplicate titles via is_similar, and pages through
    the "down" control until it disappears.
    """
    driver = webdriver.PhantomJS()
    driver.maximize_window()
    driver.get(self.start_urls[0])
    time.sleep(2)
    # Type the real keyword into the search box and submit.
    driver.find_element_by_id("data-widget-searchword").clear()
    driver.find_element_by_xpath(
        '//input[@id="data-widget-searchword"]').send_keys(self.keys)
    driver.find_element_by_xpath('//input[@class="search_btn"]').click()
    time.sleep(3)
    while True:
        # Scroll bottom -> top -> bottom to trigger lazy loading.
        target = driver.find_element_by_xpath('//div[@class="qy_footer"]')
        driver.execute_script("arguments[0].scrollIntoView();", target)
        time.sleep(3)
        target = driver.find_element_by_xpath('//div[@class="logo_wrap"]')
        driver.execute_script("arguments[0].scrollIntoView();", target)
        time.sleep(3)
        target = driver.find_element_by_xpath('//div[@class="qy_footer"]')
        driver.execute_script("arguments[0].scrollIntoView();", target)
        selector = etree.HTML(driver.page_source)
        for node in selector.xpath('//li[@class="list_item"]'):
            item = VideocrawlItem()
            item['title'] = node.xpath('div/h3/a/@title')
            # Skip untitled results and near-duplicates.
            if not item['title'] or is_similar(item['title'][0]):
                continue
            item['desc'] = node.xpath('div/h3/a/@title')
            item['video_url'] = node.xpath('a/@href')
            item['date'] = node.xpath(
                'div/div/div/em[@class="result_info_desc"]/text()')
            if not item['date']:
                continue
            item['date'][0] = date_cal(item['date'][0])
            item['duration'] = node.xpath('a/p/span/text()')
            item['classify'] = 'null'
            # BUG FIX: the original indexed video_url[0] without checking
            # for an empty result, which raised IndexError.
            if not item['video_url']:
                continue
            item['classify'] = self.classify(str(item['video_url'][0]))
            yield item
        try:
            next_page = driver.find_element_by_xpath('//a[@data-key="down"]')
            next_page.click()
            time.sleep(3)
        # Narrowed from a bare except: — missing button ends pagination.
        except Exception:
            print("next_page is over")
            break
def parse(self, response):
    """Parse a Tencent half-width-card listing.

    Yields one VideocrawlItem per card; relative hrefs are prefixed with
    self.tengxun_url to form an absolute URL.
    """
    sell = Selector(response)
    for sel in sell.xpath('//span[@class="item item_half"]'):
        item = VideocrawlItem()
        item['title'] = sel.xpath('a/@title').extract()
        item['desc'] = sel.xpath('a/@title').extract()
        href = sel.xpath('a/@href').extract()
        # BUG FIX: extract()[0] on an empty result raised IndexError;
        # skip cards with no link.
        if not href:
            continue
        item['video_url'] = self.tengxun_url + href[0]
        item['date'] = "null"
        item['duration'] = "null"
        item['classify'] = u'动物萌宠'
        yield item
def parse(self, response):
    """Yield a classify-request for every card on the page.

    Each card's metadata is packed into a VideocrawlItem and handed to
    parse_classify through the request meta; cards without a link are
    skipped.
    """
    cards = Selector(response).xpath('//div[@class="item"]')
    for card in cards:
        item = VideocrawlItem()
        item['title'] = card.xpath('div/div/@title').extract()
        item['desc'] = card.xpath('div/div/@title').extract()
        link = card.xpath('div/div/a/@href').extract()
        item['video_url'] = link
        # A card without a link cannot be followed; skip it.
        if not link:
            continue
        item['date'] = "null"
        item['duration'] = card.xpath(
            'div/div/a/div/span[@class="c-time"]/span/text()').extract()
        item['classify'] = ""
        yield Request(url=str(link[0]),
                      meta={'item': item},
                      callback=self.parse_classify)
def parse(self, response):
    """Parse a Youku row listing.

    Yields one classify-Request per card (VideocrawlItem in request meta),
    then follows the "next" pagination link.
    """
    sell = Selector(response)
    for sel in sell.xpath('//div[@class="yk-row"]/div'):
        item = VideocrawlItem()
        item['title'] = sel.xpath('div/div/a/@title').extract()
        item['desc'] = sel.xpath('div/div/a/@title').extract()
        item['video_url'] = sel.xpath('div/div/a/@href').extract()
        # BUG FIX: video_url[0] on an empty extract() raised IndexError;
        # skip cards with no link (matches the sibling parsers).
        if not item['video_url']:
            continue
        item['date'] = "null"
        item['duration'] = sel.xpath(
            'div/ul/li/span/span/text()').extract()
        item['classify'] = "01"
        yield Request(url=str(item['video_url'][0]),
                      meta={'item': item},
                      callback=self.parse_classify)
    # Follow the "next page" link, if present.
    links = response.xpath('//li[@class="next"]')
    for every_url in links:
        href = every_url.xpath('a/@href').extract()
        # Guard: extract()[0] on an empty list raised IndexError.
        if not href:
            continue
        yield Request(str(href[0]), callback=self.parse)
def parse(self, response):
    """Render a Tencent listing with PhantomJS and yield one item per entry.

    Waits for the list to appear, then scrapes every entry; relative links
    are prefixed with self.tengxun_url.
    """
    driver = webdriver.PhantomJS()
    driver.get(self.start_urls[0])
    WebDriverWait(driver, 20).until(
        lambda x: x.find_element_by_xpath('//li[@class="list_item"]'))
    sell = etree.HTML(driver.page_source)
    for node in sell.xpath('//li[@class="list_item"]'):
        item = VideocrawlItem()
        item['title'] = node.xpath('@data-title')
        # Original behavior preserved: an untitled entry ends the scrape.
        if not item['title']:
            break
        item['desc'] = node.xpath('@data-title')
        item['video_url'] = node.xpath('a/@href')
        # BUG FIX: video_url[0] on an empty list raised IndexError;
        # skip entries with no link.
        if not item['video_url']:
            continue
        item['video_url'][0] = self.tengxun_url + str(item['video_url'][0])
        item['date'] = "null"
        item['duration'] = node.xpath(
            'a/div/div/span[@class="figure_info"]/text()')
        item['classify'] = u'母婴育儿'
        yield item
def parse(self, response):
    """Search bilibili for self.keys and yield one item per video.

    Scrolls each result page bottom -> top -> bottom to trigger lazy
    loading, skips near-duplicate titles via is_similar, and pages via
    the "nextPage" link until it disappears.
    """
    try:
        driver = webdriver.PhantomJS()
    except Exception:
        # BUG FIX: the original printed and fell through, then crashed
        # with NameError on the unbound `driver`; propagate the real error.
        print("driver error")
        raise
    driver.maximize_window()
    driver.get("http://search.bilibili.com/all?keyword=TFBOYS")
    time.sleep(2)
    # Type the real keyword into the search box and submit.
    driver.find_element_by_id("search-keyword").clear()
    driver.find_element_by_xpath(
        '//input[@id="search-keyword"]').send_keys(self.keys)
    driver.find_element_by_xpath('//div[@id="search-button"]').click()
    time.sleep(3)
    while True:
        # Scroll bottom -> top -> bottom to trigger lazy loading.
        for anchor in ('//a[@id="weixin"]',
                       '//div[@id="header-search"]',
                       '//a[@id="weixin"]'):
            target = driver.find_element_by_xpath(anchor)
            driver.execute_script("arguments[0].scrollIntoView();", target)
            time.sleep(3)
        selector = etree.HTML(driver.page_source)
        for node in selector.xpath('//li[@class="video matrix "]'):
            item = VideocrawlItem()
            item['title'] = node.xpath('a/@title')
            # BUG FIX: the original tested item['title'][0], which raised
            # IndexError when the title list was empty.
            if not item['title'] or not item['title'][0]:
                continue
            if is_similar(item['title'][0]):
                continue
            item['desc'] = node.xpath('a/@title')
            item['video_url'] = node.xpath('a/@href')
            dates = node.xpath('.//span[@class="so-icon time"]/text()')
            # Guard: the original indexed dates[1] unconditionally.
            if len(dates) < 2:
                continue
            item['date'] = date_cal(
                dates[1].replace(' ', '').replace('\n', '').replace('\t', ''))
            durations = node.xpath('a/div/span/text()')
            # Guard: the original indexed durations[0] unconditionally.
            if not durations:
                continue
            item['duration'] = durations[0].replace(
                ' ', '').replace('\n', '').replace('\t', '')
            item['classify'] = 'null'
            if not item['video_url']:
                continue
            item['classify'] = self.classify(str(item['video_url'][0]))
            yield item
        try:
            next_page = driver.find_element_by_xpath('//a[@class="nextPage"]')
            next_page.click()
            time.sleep(3)
        # Narrowed from a bare except:; also fixed the "nexr" typo.
        except Exception:
            print("next page is over")
            break
def parse(self, response):
    """Search v.qq.com for self.keys and yield one item per result.

    Scrolls each result page bottom -> top -> bottom to trigger lazy
    loading, skips near-duplicate titles via is_similar, and pages via
    the "page_next" link until it disappears.
    """
    try:
        driver = webdriver.PhantomJS()
    except Exception:
        # BUG FIX: the original printed and fell through, then crashed
        # with NameError on the unbound `driver`; propagate the real error.
        print("driver error")
        raise
    driver.maximize_window()
    driver.get("http://v.qq.com/x/search/?q=redis&stag=102&smartbox_ab=")
    time.sleep(2)
    # Type the real keyword into the search box and submit.
    driver.find_element_by_id("keywords").clear()
    driver.find_element_by_xpath('//input[@id="keywords"]').send_keys(
        self.keys)
    driver.find_element_by_xpath('//button[@class="search_btn"]').click()
    time.sleep(3)
    while True:
        # Scroll bottom -> top -> bottom to trigger lazy loading.
        for anchor in ('//div[@class="footermenu"]',
                       '//div[@class="site_logo"]',
                       '//div[@class="footermenu"]'):
            target = driver.find_element_by_xpath(anchor)
            driver.execute_script("arguments[0].scrollIntoView();", target)
            time.sleep(3)
        selector = etree.HTML(driver.page_source)
        for node in selector.xpath('//div[@class="result_item result_item_h"]'):
            item = VideocrawlItem()
            item['title'] = node.xpath('h2/a/text()')
            if not item['title']:
                continue
            # A highlighted keyword splits the title text in two; rejoin
            # the fragments around the search keyword.
            if len(item['title']) > 1:
                item['title'][0] = item['title'][0] + str(
                    self.keys) + item['title'][1]
                item['title'] = item['title'][:1]
            if is_similar(item['title'][0]):
                continue
            item['desc'] = node.xpath('h2/a/text()')
            item['video_url'] = node.xpath('a/@href')
            item['date'] = node.xpath(
                'div/div/div/span[@class="content"]/text()')
            # BUG FIX: item['date'][0] raised IndexError when no date present.
            if not item['date']:
                continue
            item['date'] = date_cal(item['date'][0])
            item['duration'] = node.xpath('a/span/span/text()')
            # BUG FIX: item['duration'][0] raised IndexError when empty.
            if not item['duration']:
                continue
            # Drop the first two characters (site-specific prefix on the
            # duration text — presumably a label; TODO confirm).
            item['duration'][0] = item['duration'][0][2:]
            item['classify'] = 'null'
            if not item['video_url']:
                continue
            url = str(item['video_url'][0])
            # NOTE(review): classification was disabled in the original;
            # kept disabled to preserve behavior.
            # item['classify'] = self.classify(url)
            yield item
        try:
            next_page = driver.find_element_by_xpath('//a[@class="page_next"]')
            next_page.click()
            time.sleep(3)
        # Narrowed from a bare except:; also fixed the "nexr" typo.
        except Exception:
            print("next page is over")
            break