def parse_dir_contents(self, response):
    """Scrape one Amazon-Instant-Video-style detail page and yield a DmozItem.

    Saves the raw page body to output11/<path-segment>.html, then extracts
    cost, title, release year, runtime, director and starring actors.
    NOTE(review): Python 2 code — extracted unicode is byte-encoded with
    .encode('utf-8'); `filter(None, ...)` returns a list on Python 2 only.
    """
    # fourth URL path segment names the output file — assumes a stable URL shape
    str1 = response.url.split("/")[3]
    filename = 'output11/' + str1 + '.html'
    with open(filename, 'wb') as f:
        f.write(response.body)
    hxs = HtmlXPathSelector(response)  # legacy selector API (pre-Scrapy 0.24)
    # extract the cost for new format
    HDcost1 = hxs.xpath('//*[@class="dv-button-inner"]/text()').extract()
    len1 = len(HDcost1)
    # drop the first button label (presumably not a price — TODO confirm)
    del HDcost1[0]
    # len1 - 1 matches the list length after the deletion above
    for i in range(0, len1 - 1):
        var1 = HDcost1[i]
        var1 = var1.encode('utf-8')
        HDcost1[i] = var1
    # extract the title for new format
    title1 = hxs.xpath('//*[@id="aiv-content-title"]/text()').extract()
    len1 = len(title1)  # reuses len1 for the title list
    for i in range(0, len1):
        var1 = title1[i]
        var1 = var1.encode('utf-8')
        var1 = var1.strip()
        title1[i] = var1
    # drop fragments that stripped down to empty strings
    title1 = filter(None, title1)
    # extract the release year for new format
    relyear = hxs.xpath('//*[@class="release-year"]/text()').extract()
    relyear1 = relyear[0].encode('utf-8')
    relyear1 = relyear1.strip()
    # extract the runtime for new format (positional path — fragile if layout shifts)
    times = hxs.xpath(
        '//*[@id="dv-dp-left-content"]/div[2]/div[2]/dl/dd[2]/text()'
    ).extract()
    time1 = times[0].strip()
    time1 = time1.encode('utf-8')
    # extract the director for new format
    dir1 = response.xpath(
        '//*[@id="dv-center-features"]/div[1]/div/table/tr[2]/td/a/text()'
    ).extract()
    dir1 = dir1[0].encode('utf-8')
    dir1 = dir1.strip()
    # extract the starring actors (dd[1] of the same definition list as runtime)
    actors = hxs.select(
        '//*[@id="dv-dp-left-content"]/div[2]/div[2]/dl/dd[1]/text()'
    ).extract()
    actors = actors[0].encode('utf-8')
    actors = actors.strip()
    yield DmozItem(
        title=title1,
        time=time1,
        cost=HDcost1,
        year=relyear1,
        director=dir1,
        star=actors,
    )
def parse(self, response):
    """Yield one DmozItem per <li> entry: anchor text, anchor href, and the
    node's own text fragments."""
    for node in response.xpath('//ul/li'):
        entry = DmozItem()
        entry['link'] = node.xpath('a/@href').extract()
        entry['title'] = node.xpath('a/text()').extract()
        entry['desc'] = node.xpath('text()').extract()
        yield entry
def parse(self, response):
    """Save the raw page body to a file named after the URL, then pair up
    site titles, links and (every other) description fragment into items."""
    filename = response.url.split('/')[-2]
    with open(filename, 'wb') as f:
        f.write(response.body)
    #build response file
    sel = scrapy.selector.Selector(response)
    items = []
    title = sel.xpath('//div[@class="site-title"]/text()').extract()
    link = sel.xpath('//div[@class="title-and-desc"]/a/@href').extract()
    desc = sel.xpath('//div[@class="site-descr "]/text()').extract()
    # keep only even-indexed description fragments — presumably odd ones are
    # whitespace/noise nodes; TODO confirm against the page markup
    desclist = []
    for i in range(len(desc)):
        if i % 2 == 0:
            desclist.append(desc[i])
    # assumes title/link/desclist stay index-aligned — a missing link or
    # description on the page would raise IndexError here
    for j in range(len(title)):
        item = DmozItem()  #instantiation
        item['title'] = title[j]
        item['link'] = link[j]
        item['desc'] = desclist[j]
        items.append(item)
    return items
    # NOTE(review): the line below opens an unterminated triple-quoted string —
    # apparently leftover commented-out code; confirm where it closes in the full file.
    '''titles = sel.xpath('//div[@class="site-title"]/text()').extract()
def parse_item2(self, response):
    """Load a question/answer item: 'type' collects the last breadcrumb link
    plus the question heading; 'answer' gets the answer heading plus a
    hard-coded literal value."""
    loader = ItemLoader(item=DmozItem(), response=response)
    loader.add_xpath(
        'type',
        '//div[@class="location ask_main_location"]/span[@class="fl"]/a[last()]/text()')
    loader.add_xpath('type', '//div[@class="question"]/h2/text()')
    loader.add_xpath('answer', '//div[@class="anwser"]/h2/text()')
    loader.add_value('answer', '牛逼')
    yield loader.load_item()
def parse(self, response):
    """Scrape a product grid: yield one detail-page Request per product
    (partially-filled item travels in meta), then follow the "next" link.

    Fixes vs. the previous version:
    - the pagination list was indexed with [-2] *before* the emptiness check,
      raising IndexError when fewer than two pagination links exist;
    - the builtin `next` was shadowed by a local;
    - an unused local `des` (description text) was extracted and dropped.
    """
    sel = Selector(response)
    for site in sel.css('div.product-grid > div'):
        item = DmozItem()
        item['title'] = site.css('div.name > a::text').extract()[0]
        item['link'] = site.css('div.name > a::attr("href")').extract()[0]
        # strip all spaces and newlines out of the raw price string
        raw_price = site.css('div.price::text').extract()[0]
        item['price'] = raw_price.replace(' ', '').replace('\n', '').replace('\r', '')
        # fetch the product detail page, carrying the item along in meta
        yield http.Request(url=item['link'], meta={'item': item},
                           callback=self.parseDetail, dont_filter=True)
    # the second-to-last link in the pagination bar is the "next page" anchor
    page_links = sel.xpath('//div[@class="links"]/a/@href').extract()
    if len(page_links) >= 2:
        yield http.Request(page_links[-2], callback=self.parse)
def start_requests(self):
    """Resume crawling from offset 3618 of self.next_url, tagging every
    request with its 1-based position via meta['item']."""
    total = len(self.next_url)
    for idx in range(3618, total):
        tag = DmozItem()
        tag['tencent_index'] = idx + 1
        yield scrapy.Request(self.next_url[idx],
                             meta={'item': tag},
                             callback=self.parse,
                             dont_filter=True)
def parse(self, response):
    """Yield one DmozItem per DMOZ "title-and-desc" block: title text,
    anchor href, and description div text."""
    for sel in response.xpath('//div[@class="title-and-desc"]'):
        item = DmozItem()
        item['title'] = sel.xpath('a/div/text()').extract()
        item['link'] = sel.xpath('a/@href').extract()
        item['desc'] = sel.xpath('div/text()').extract()
        yield item
    # NOTE(review): the trailing ''' below opens an unterminated string —
    # apparently a stray end-of-comment-region marker; confirm against the full file.
    '''
def parse_tag(self, response):
    """Scrape the app list for one tag page using a Selenium-driven Chrome.

    Clicks the "refresh/load more" button up to 99 times to force all entries
    to render, then parses the accumulated page source and writes one
    "<package-name> <install-count>" line per app to tag_app_<clsname>.
    NOTE(review): the output handle `f` is never closed, the bare `except:`
    clauses swallow all errors, and u'\n' is written to a byte-mode file —
    all worth tightening.
    """
    # print response.url
    # the URL carries escaped bytes as \xNN — turn them into %NN and unquote
    urllk = urllib.unquote(response.url.replace("\\x", "%"))
    clsname = urllk.split('/')[-1]  # last path segment names the tag/category
    filename = "tag_app_" + clsname
    f = open(filename, "w")
    # rx1 captures install count, display name and package name from the
    # install-button markup; rx2 appears unused in this method
    rx1 = ur"data-install=\"(.*?)\".*data-name=\"(.*?)\".*data-pn=\"(.*?)\""
    rx2 = ur"<li class=\"(.*?)\" data-pn=\"(.*?)\""
    driver = webdriver.Chrome(os.environ['webdriver.chrome.driver'])
    driver.get(response.url)
    # keep clicking "load more" until the button disappears (or 99 clicks)
    for i in range(1, 100):
        try:
            driver.find_element_by_id("j-refresh-btn").click()
            time.sleep(1)  # let the newly loaded entries render
        except:
            break
    sou = driver.page_source
    cnt = 0       # number of apps successfully parsed
    name = ""     # display name of the last parsed app (for the summary print)
    hxs = Selector(text=sou)
    for sel in hxs.xpath('//*[@id="j-tag-list"]/li'):
        item = DmozItem()
        item['cls'] = clsname
        print sel
        try:
            # the install button class sometimes has a trailing space — try both
            if len(sel.xpath('a[@class="install-btn"]').extract()) > 0:
                ma1 = re.search(
                    rx1,
                    sel.xpath('a[@class="install-btn"]').extract()[0])
            elif len(sel.xpath('a[@class="install-btn "]').extract()) > 0:
                ma1 = re.search(
                    rx1,
                    sel.xpath('a[@class="install-btn "]').extract()[0])
            else:
                continue
            if ma1:
                cnt += 1
                #print "install:",ma1.group(1), "name:", ma1.group(2), " pn:", ma1.group(3)
                item['pn'] = ma1.group(3)   # package name
                item['isc'] = ma1.group(1)  # install count
                name = ma1.group(2)         # display name
                #print item['pn'], " | ", item['cls'][0]
                data = u' '.join(
                    (item['pn'], item['isc'])).encode('utf-8').strip()
                #print data
                f.write(data)
                f.write(u'\n')
            else:
                print 'not found!'
                continue
        except:
            continue
    data = u"total found %d packages\n" % cnt
    print "cate:", clsname, " total:", data, "last app:", name
    f.write(data)
    driver.close()
def parse(self, response):
    """Return a list of items, one per <body> element (normally one), each
    holding every body > h1 text fragment."""
    collected = []
    for node in Selector(response).xpath('//body'):
        record = DmozItem()
        # absolute path: searches the whole document, independent of `node`
        record['content'] = node.xpath('//body/h1/text()').extract()
        collected.append(record)
    return collected
def parse(self, response):
    """Yield one DmozItem per mashup table row: name, description, category."""
    rows = response.xpath(
        '//*[@id="block-system-main"]/article/div[7]/div[1]/table/tbody/tr')
    for row in rows:
        record = DmozItem()
        record['Mashup_Name'] = row.xpath('td[1]/a/text()').extract()
        record['Description'] = row.xpath('td[2]/text()').extract()
        record['Category'] = row.xpath('td[3]/a/text()').extract()
        yield record
def parse(self, response):
    """Emit a DmozItem (title + link) for every matched content card under
    #mainContent, printing a debug marker per match."""
    card_path = '//*[@id="mainContent"]/div/div[*]/div[2]'
    for card in response.xpath(card_path):
        print("惺惺惜惺惺想寻寻寻寻寻寻寻寻寻")  # debug marker per matched node
        record = DmozItem()
        record['link'] = card.xpath('a/@href').extract()
        record['title'] = card.xpath('a/text()').extract()
        yield record
def page_2(self, response):
    """Return a single-element list holding one DmozItem with the page's
    substance text ('name') and report text ('desc')."""
    record = DmozItem()
    record['name'] = response.xpath(
        '//div[@class="substance"]/text()').extract()
    record['desc'] = response.xpath(
        '//div[@class="report-text-surround"]/text()').extract()
    return [record]
def parse(self, response):
    """Yield one item summarizing the page: <title> text, meta description
    content, and dns-prefetch link hrefs."""
    record = DmozItem()
    record['title'] = response.xpath('//title/text()').extract()
    record['desc'] = response.xpath(
        '//meta[@name="description"]/@content').extract()
    record['link'] = response.xpath(
        '//link[@rel="dns-prefetch"]/@href').extract()
    yield record
def parse(self, response): for sel in response.xpath('//div[@class="title-and-desc"]'): item = DmozItem() item['title'] = sel.xpath( 'a/div[@class="site-title"]/text()').extract() #.strip() item['link'] = sel.xpath('a/@href').extract() item['desc'] = sel.xpath('div/text()').extract() #.strip() #print(title,link,desc) yield item
def parse(self, response):
    """Yield price/address/name per listing row, logging each station name
    via the legacy scrapy.log API."""
    for row in response.xpath('//div[@class="clsShow"]/div'):
        record = DmozItem()
        record['price'] = row.xpath('div[@class="gPrice"]/text()').extract()
        record['address'] = row.xpath(
            'div[@class="gAddress"]/text()').extract()
        record['name'] = row.xpath('div[@class="gStatn"]/a/text()').extract()
        log.msg(record['name'])  # legacy logging helper
        yield record
def parse(self, response):
    """Harvest every <img> src via BeautifulSoup and recursively follow all
    anchors, prefixing site-relative links with the zhihu.com origin.

    (Earlier experiments — response.css pseudo-elements, Selector xpath over
    RichContent, soup tag navigation / find_all probes — were commented out;
    only the two rules below are live.)
    """
    soup = BeautifulSoup(response.body, "lxml")
    # rule 1: one item per image, carrying its src attribute
    for tag in soup.find_all('img'):
        record = DmozItem()
        record['link'] = tag['src']
        yield record
    # rule 2: follow every anchor; links with no 'http' anywhere in them are
    # treated as site-relative and get the origin prepended
    for href in response.xpath('//a/@href').extract():
        if 'http' not in href:
            href = 'https://www.zhihu.com' + href
        yield scrapy.Request(href, callback=self.parse)
def parse_page2(self, response):
    """Yield title/link/desc for every '.site-item' entry on the page.

    Fix: the per-item XPath expressions previously began with '//*', which
    searches the WHOLE document rather than the current `sel` node, so every
    yielded item repeated the first entry's title and link. Prefixing the
    paths with '.' keeps each query relative to the matched site-item.
    """
    for sel in response.css('.site-item '):
        item = DmozItem()
        item['title'] = sel.xpath(
            './/*[@class="title-and-desc"]/a/div[@class="site-title"]/text()'
        ).extract_first().strip()
        item['link'] = sel.xpath(
            './/*[@class="title-and-desc"]/a/@href').extract_first().strip()
        # CSS queries on `sel` are already scoped to the matched element
        item['desc'] = sel.css('.site-descr ::text').extract_first().strip()
        yield item
def parse(self, response):
    """Yield title/link/desc for each <li> under any <ul> on the page.

    (A previous variant also dumped response.body to a file named after the
    second-to-last URL segment; that code stayed disabled.)
    """
    entries = response.xpath('//ul/li')
    for entry in entries:
        record = DmozItem()  # project-defined, dict-like item
        record['desc'] = entry.xpath('text()').extract()
        record['link'] = entry.xpath('a/@href').extract()
        record['title'] = entry.xpath('a/text()').extract()
        yield record
def parse(self, response):
    """Return one item per grid entry, keeping the raw serialized markup of
    its title <h2> and its oper/shoptext div."""
    selector = HtmlXPathSelector(response)  # legacy selector API
    results = []
    for entry in selector.select('//div[@class="griditem"]'):
        record = DmozItem()
        record['title'] = entry.select('h2[@class="title"]').extract()[0]
        record['link'] = entry.select(
            'div[@class="oper clear-fix shoptext"]').extract()[0]
        results.append(record)
    return results
def parse_item(self, response):
    """Return a one-element list with a DmozItem whose title/desc1 hold the
    UTF-8 byte-encoded text fragments of two fixed page regions."""
    picker = Selector(response)
    raw_title = picker.xpath(
        '//*[@id="wrap"]/div[2]/div[1]/div[1]/div[1]/text()').extract()
    raw_desc = picker.xpath(
        '//*[@id="wrap"]/div[2]/div[1]/div[3]/div[2]/div[1]/text()').extract()
    record = DmozItem()
    # byte-encode every fragment (Python 2 unicode -> str)
    record['title'] = [fragment.encode('utf-8') for fragment in raw_title]
    record['desc1'] = [fragment.encode('utf-8') for fragment in raw_desc]
    return [record]
def parse_item(self, response):
    """Return [item] carrying the page title, the serialized description
    <meta> element, and the page URL.

    Fix: the meta XPath was '//meta[name="description"]', which compares a
    child *element* called <name> — it matches nothing in HTML. The intended
    attribute test is '//meta[@name="description"]'. The debug print is also
    parenthesized so the block parses on both Python 2 and 3.
    """
    item = DmozItem()
    item['title'] = response.xpath('//title/text()').extract()
    item['desc'] = response.xpath('//meta[@name="description"]').extract()
    item['link'] = response.url
    print(item)  # debug: show the scraped item
    return [item]
def parse_review(self, response):
    """Yield an item holding the chapter's image URLs; any failure is logged
    rather than propagated."""
    # e.g. https://manga.mipcdn.com/i/s/https://mhimg.eshanyao.com/ManHuaKu/y/yirenzhixia/1jiejie1/2019300523.jpg
    try:
        record = DmozItem()
        # NOTE(review): '<image>' is an unusual tag name for HTML pages —
        # confirm the site's markup actually uses it
        record['link'] = response.xpath(
            '//div[@id="images"]/image/@src').extract()
        yield record
    except Exception as error:
        log(error)
def parse(self, response):
    """Yield title/link/desc for every DMOZ site entry in the browse results.

    (The body-to-file dump remained commented out in the original.)
    """
    entry_path = ('//div[@class="results browse-content"]'
                  '//div[@class="site-item "]/div[@class="title-and-desc"]')
    for entry in response.xpath(entry_path):
        record = DmozItem()
        record['link'] = entry.xpath('a/@href').extract()
        record['title'] = entry.xpath(
            'a/div[@class="site-title"]/text()').extract()
        record['desc'] = entry.xpath(
            'div[@class="site-descr "]/text()').extract()
        yield record
def parse(self, response):
    """Produce one DmozItem per <ul><li> entry with anchor text, href and
    the node's own trailing text. (File-dump debugging stayed disabled.)"""
    list_items = response.xpath('//ul/li')
    for li in list_items:
        record = DmozItem()
        record['title'] = li.xpath('a/text()').extract()
        record['desc'] = li.xpath('text()').extract()
        record['link'] = li.xpath('a/@href').extract()
        yield record
def parse_old(self, response):
    """Yield chapter items from the mulu_list menu, printing each resolved
    absolute URL for debugging."""
    for entry in response.xpath('//ul[@class="mulu_list"]/li'):
        record = DmozItem()
        record['title'] = entry.xpath('a/text()').extract()
        record['link'] = entry.xpath('a/@href').extract()
        chapter_name = record['title'][0]  # first text fragment (kept, unused)
        href = record['link'][0]
        absolute = response.urljoin(href)
        print(absolute.encode('utf8'))  # debug: resolved chapter URL
        yield record
def parse(self, response):
    """Return one item per <div> on the page, each carrying the src values
    of its direct <img> children (often an empty list)."""
    collected = []
    for block in Selector(response).xpath('//div'):
        record = DmozItem()
        record['title'] = block.xpath('img/@src').extract()
        collected.append(record)
    return collected
def parse(self, response):
    """Scrape job postings into items and dump each item's joined field
    values to texto.txt.

    Fix: the output file is now opened with a context manager so the handle
    is closed even when extraction raises mid-loop; previously f.close() was
    only reached after a fully successful loop, leaking the handle on error.
    NOTE(review): items are written back-to-back with no separator between
    them, and dict value order is arbitrary on Python 2 — confirm the
    intended output format.
    """
    hxs = HtmlXPathSelector(response)  # legacy selector API
    sites = hxs.select('..//div[@class=" orddata"]/ul')
    items = []
    with open('texto.txt', 'w') as f:
        for site in sites:
            item = DmozItem()
            item['titleVaga'] = ''.join(
                site.select('li//h2/text()').extract())
            item['link'] = ''.join(
                site.select('li/a[@class="vagaTitle"]/@href').extract())
            item['desc'] = ''.join(
                site.select('li[@class="vagaDesc"]/text()').extract())
            item['location'] = ''.join(
                site.select('li[@class="location2"]/text()').extract())
            item['area'] = ''.join(
                site.select('li[@class="area "]/span/text()').extract())
            items.append(item)
            f.write(', '.join(item.values()).encode('utf-8'))
    return items
def parse(self, response):
    """Return title/link/desc items for each entry of the #righ_list menu."""
    results = []
    for node in Selector(response).xpath('//div[@id="righ_list"]/ul/li'):
        record = DmozItem()
        record['desc'] = node.xpath('text()').extract()
        record['link'] = node.xpath('a/@href').extract()
        record['title'] = node.xpath('a/text()').extract()
        results.append(record)
    return results
def parse(self, response):
    """Yield one DmozItem (the entry's title attribute) per subject entry.

    Fix: the original did `return item` AFTER the loop, which (a) emitted
    only the LAST matched entry and (b) raised NameError when nothing
    matched. The commented-out items.append showed that collecting every
    entry was intended; yielding inside the loop emits all of them and is a
    valid Scrapy callback return style.
    """
    hxs = HtmlXPathSelector(response)  # legacy selector API
    for site in hxs.xpath("//ul[@class='tlst clearfix']/li[@class='ilst']"):
        item = DmozItem()
        item['title'] = site.xpath('a/@title').extract()
        yield item
def parse_url2(self, response):
    """Yield a (currently empty) DmozItem for an article page.

    Title and content are still extracted — so a missing headline raises
    IndexError exactly as before — but the field assignments remain disabled.
    """
    item = DmozItem()
    picker = Selector(response)
    # article headline: first matched text fragment
    headline = picker.xpath("//div[@class='title']/h1/text()").extract()[0]
    # every text fragment inside the content div
    body_parts = picker.xpath("//div[@id='content']//text()").extract()
    # item['article_url'] = response.url
    # item['article_title'] = headline
    # item['article_content'] = "".join(body_parts)
    yield item
def parse(self, response):
    """For every <aside>, yield an item pairing its h2 link texts (top-level
    categories) with its h3 link texts (sub-categories)."""
    for aside in response.xpath('//aside'):
        record = DmozItem()
        record['top_cat'] = aside.xpath('.//h2//a/text()').extract()
        record['sub_cat'] = aside.xpath('.//h3//a/text()').extract()
        yield record