def generateTempitemForGet(self, response):
    # Split a GET request URL into its base URL and query parameters.
    parsed = urlparse.urlparse(response.url)
    parameters = urlparse.parse_qs(parsed.query)
    item = ProjectItem()
    url = parsed.geturl()
    if "?" in url:
        item['url'] = url[:url.find('?')]
    else:
        item['url'] = url
    item['param'] = parameters
    item['type'] = "GET"
    if self.login_required:
        item["loginrequired"] = "true"
        item["loginurl"] = self.login_url
    else:
        item["loginrequired"] = "false"
        item["loginurl"] = ""
    # Carry over the Referer header when the request had one.
    referer = None
    if "Referer" in response.request.headers:
        referer = response.request.headers["Referer"]
    item["headers"] = {
        "referer": referer,
        "user-agent": response.request.headers["User-Agent"],
    }
    return item
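# The ProjectItem class referenced throughout these methods is not shown in
# this section. A minimal sketch of the shape the GET/POST item builders above
# and below appear to assume follows; the field names are inferred from usage
# here and are an assumption, not the project's actual item definition (other
# parsers in this section clearly use differently shaped items).
import scrapy

class ProjectItem(scrapy.Item):
    # Hypothetical fields inferred from generateTempitemForGet / generatePostItem.
    url = scrapy.Field()            # base URL with the query string removed
    param = scrapy.Field()          # dict of query/form parameters
    type = scrapy.Field()           # "GET" or "POST"
    loginrequired = scrapy.Field()  # "true"/"false" as strings
    loginurl = scrapy.Field()       # login form action, if any
    headers = scrapy.Field()        # selected request headers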
def parse(self, response):
    part = response.meta['part']
    # inspect_response(response, self)
    for li in response.xpath('//ul[@class="for-list"]/li')[1:2]:
        item = ProjectItem()
        item['part'] = part
        item['title'] = li.xpath(
            'div[@class="titlelink box"]/a[@class="truetit"]//text()'
        ).get().strip().replace('\n', ' ')
        href = li.xpath('div[@class="titlelink box"]/a[@class="truetit"]/@href').get()
        item['href'] = 'https://bbs.hupu.com' + href
        item['id'] = re.search(r'\d+', href).group()
        item['author'] = li.xpath('div[@class="author box"]/a[@class="aulink"]/text()').get()
        # Reply and view counts share one node, separated by a non-breaking space.
        counts = li.xpath('span[@class="ansour box"]/text()').get().split('\xa0')
        item['reply_count'] = counts[0]
        item['view_count'] = counts[-1]
        link_count_text = li.xpath(
            'div[@class="titlelink box"]/span[@class="light_r "]/a/@title'
        ).get()
        if link_count_text:
            link_count = re.search(r'\d+', link_count_text).group()
        else:
            link_count = 0
        item['link_count'] = link_count
        item['comment'] = []
        # yield item
        # The mobile reply API serves 20 replies per page (ceiling division).
        total_page = (int(item['reply_count']) - 1) // 20 + 1
        print('{},{},{},{},{},{},{}, total pages: {}'.format(
            item['id'], item['href'], item['title'], item['author'],
            item['reply_count'], item['view_count'], item['link_count'], total_page))
        if item['reply_count'] != '0':
            yield scrapy.Request(
                'https://m.hupu.com/api/bbs/replies/{}-1'.format(item['id']),
                meta={'item': item, 'total_page': total_page, 'page': 1},
                callback=self.comment)
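# The total_page formula above is ceiling division under a 20-replies-per-page
# assumption; a quick standalone illustration of how it behaves at the edges:
for reply_count in (0, 1, 20, 21):
    total_page = (reply_count - 1) // 20 + 1
    print(reply_count, total_page)  # 0 -> 0, 1 -> 1, 20 -> 1, 21 -> 2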
def parse_items(self, response):
    item = ProjectItem()
    item['url'] = response.url
    item['text'] = self.get_text(response)
    domain = self.get_domain(response.url)
    item['unique_id'] = self.domain_to_id[domain]
    item['depth'] = response.request.meta['depth']  # set by DepthMiddleware
    print("Depth: ", item['depth'])
    yield item
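# The 'depth' meta key read above is populated by Scrapy's built-in
# DepthMiddleware, which is enabled by default. The related settings, should
# the project want to bound or bias the crawl (values here are illustrative):
DEPTH_LIMIT = 5             # stop following links deeper than this
DEPTH_STATS_VERBOSE = True  # collect per-depth request counts in crawl stats
DEPTH_PRIORITY = 0          # a positive value biases toward breadth-first order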
def parse(self, response):
    item = ProjectItem()
    item['Confirmed'] = response.xpath(
        '/html/body/section[1]/div/div[1]/div[1]/text()').extract_first()
    item['Suspected'] = response.xpath(
        '/html/body/section[1]/div/div[2]/div[1]/text()').extract_first()
    item['Healing'] = response.xpath(
        '/html/body/section[1]/div/div[3]/div[1]/text()').extract_first()
    item['Death'] = response.xpath(
        '/html/body/section[1]/div/div[4]/div[1]/text()').extract_first()
    # Strip the three leading label characters from the deadline text.
    item['Deadline'] = response.xpath(
        '/html/body/section[1]/p[1]/text()').extract_first()[3:]
    yield item
def parse_item(self, response):
    # Only the first <article> on the page carries the file link; the counter
    # guards against building and yielding duplicate items for later articles.
    i = 0
    for quote in response.css('article'):
        if i == 0:
            item = ProjectItem()
            file_url = quote.css('a[data-entity-type="file"]::attr(href)').get()
            file_url = response.urljoin(file_url)
            item['file_urls'] = [file_url]
            item['file_names'] = file_url.split("/")[-1]
            i += 1
            yield item
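# The file_urls field above follows the convention consumed by Scrapy's
# built-in FilesPipeline (downloaded results land in a 'files' field);
# file_names appears to be a project-specific extra. A minimal settings
# sketch, assuming the project wants the files actually downloaded
# (the FILES_STORE path is illustrative):
ITEM_PIPELINES = {
    'scrapy.pipelines.files.FilesPipeline': 1,
}
FILES_STORE = 'downloads'  # hypothetical local directory for fetched files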
def generateLoginItem(self, form_data, action):
    self.login_url = action
    ItemPost = ProjectItem()
    ItemPost["url"] = self.login_url
    # Wrap each form value in a list, matching the parse_qs value shape.
    output_form_data = {}
    for key in form_data.keys():
        output_form_data[key] = [form_data[key]]
    ItemPost["param"] = output_form_data
    ItemPost["type"] = "POST"
    ItemPost["loginrequired"] = "false"
    ItemPost["loginurl"] = ""
    return ItemPost
def generate_login_item(self, form_data, action):
    self.login_url = action
    post_item = ProjectItem()
    post_item["url"] = self.login_url
    # Wrap each form value in a list, matching the parse_qs value shape.
    output_form_data = {}
    for key in form_data.keys():
        output_form_data[key] = [form_data[key]]
    post_item["param"] = output_form_data
    post_item["type"] = "POST"
    post_item["loginrequired"] = "false"
    post_item["loginurl"] = ""
    return post_item
def parse(self, response):
    # title = scrapy.Field()  # title
    # week = scrapy.Field()   # day of week
    # img = scrapy.Field()    # weather icon
    # temp = scrapy.Field()   # temperature
    # rain = scrapy.Field()   # precipitation
    # wind = scrapy.Field()   # wind strength
    # Select the HTML block containing all six days of weather for the loop below.
    sixday_detail = response.xpath('//div[@class="tqshow1"]')
    # Collect the six daily weather items here.
    items = []
    # Loop over each day and extract the target fields.
    for day in sixday_detail:
        # Instantiate an item from the ProjectItem() class in items.py.
        item = ProjectItem()
        # Extract the title fragments and concatenate them into one string.
        datetitle = ''
        for date in day.xpath('./h3//text()').extract():
            datetitle += date
        item['title'] = datetitle
        # Day of week: extract() returns a list, so take the first element.
        item['week'] = day.xpath('./p//text()').extract()[0]
        # Icon URL: extract() returns a list, so take the first element.
        item['img'] = day.xpath(
            './ul/li[@class="tqpng"]/img/@src').extract()[0]
        # Extract the temperature fragments and concatenate them into one string.
        templist = ''
        for temprange in day.xpath('./ul/li[2]//text()').extract():
            templist += temprange
        item['temp'] = templist
        # Precipitation: first element of the extracted list.
        item['rain'] = day.xpath('./ul/li[3]//text()').extract()[0]
        # Wind strength: first element of the extracted list.
        item['wind'] = day.xpath('./ul/li[4]//text()').extract()[0]
        # Append this day's item to the result list.
        items.append(item)
    print('----------------bye from spider------------------------')
    return items
def generate_post_item(self, post_form):
    post_item = ProjectItem()
    post_item["url"] = post_form["url"]
    post_item["param"] = post_form["fields"]
    post_item["type"] = "POST"
    if self.login_required:
        post_item["loginrequired"] = "true"
        post_item["loginurl"] = self.login_url
    else:
        post_item["loginrequired"] = "false"
        post_item["loginurl"] = ""
    # Only return the item if the form actually carries parameters.
    if post_item["param"]:
        return post_item
    return None
def start_requests(self):
    # for part, link in parts.items():
    #     open('results/{}.txt'.format(part), 'w').close()
    #     for i in range(2, 3):  # crawl the first ten pages
    #         yield scrapy.Request('https://bbs.hupu.com/rockets-{}'.format(i), meta={'part': '火箭专区'})
    # yield scrapy.Request('https://bbs.hupu.com/26073334.html', meta={'part': '火箭专区'}, callback=self.comment)
    # Seed a single hard-coded post for testing instead of crawling the board.
    item = ProjectItem()
    item['id'] = '26073334'
    item['comment'] = []
    item['part'] = '火箭专区'
    item['title'] = '【一图流】哈登压哨后撤三分命中'
    item['author'] = 'LonzoBa'
    item['reply_count'] = 0
    item['link_count'] = 0
    item['view_count'] = 0
    yield scrapy.Request('https://m.hupu.com/api/bbs/replies/26073334-1',
                         meta={'item': item, 'total_page': 1, 'page': 1},
                         callback=self.comment)
def generatePostItem(self, post_form):
    ItemPost = ProjectItem()
    ItemPost["url"] = post_form["url"]
    ItemPost["param"] = post_form["fields"]
    self.log(post_form["isGet"])
    # Forms may submit via GET or POST; record which.
    if post_form["isGet"]:
        ItemPost["type"] = "GET"
    else:
        ItemPost["type"] = "POST"
    if self.login_required:
        ItemPost["loginrequired"] = "true"
        ItemPost["loginurl"] = self.login_url
    else:
        ItemPost["loginrequired"] = "false"
        ItemPost["loginurl"] = ""
    # Only return the item if the form actually carries fields.
    if ItemPost["param"]:
        return ItemPost
    return None
def generateTempitemForGetNoResp(self, response_url):
    # Same as generateTempitemForGet, but for a bare URL with no response
    # object, so no headers can be recorded.
    parsed = urlparse.urlparse(response_url)
    parameters = urlparse.parse_qs(parsed.query)
    item = ProjectItem()
    url = parsed.geturl()
    if "?" in url:
        item['url'] = url[:url.find('?')]
    else:
        item['url'] = url
    item['param'] = parameters
    item['type'] = "GET"
    if self.login_required:
        item["loginrequired"] = "true"
        item["loginurl"] = self.login_url
    else:
        item["loginrequired"] = "false"
        item["loginurl"] = ""
    item["headers"] = {}
    return item
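# The two URL-splitting helpers above use the Python 2 urlparse module. If the
# project moves to Python 3, the same functions live in urllib.parse; a small
# compatibility sketch, assuming no separate compat layer (e.g. six) is in use:
try:
    import urlparse  # Python 2
except ImportError:
    import urllib.parse as urlparse  # Python 3: same urlparse/parse_qs API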
def parse_new_page(self, response):
    # print(response.text)
    link_address = ''
    content_word = ''
    title = response.css('.show_con_title ::text').extract()[0]
    # Slice out the 18-character date string from the first info field.
    data = response.css('.show_con_info em:nth-child(1)::text').extract()[0][-19:-1]
    extence = response.css('.show_con_info em:nth-child(3)::text').extract()[0]
    content = response.css('.show_con_box p::text').extract()
    link = response.css('.show_con_box a[href^="http"]::attr(href)').extract()
    # Collect links to .doc/.xls/.xlsx attachments, semicolon-separated.
    for link_item in link:
        if link_item.find(".doc") != -1 or link_item.find(".xls") != -1 \
                or link_item.find(".xlsx") != -1:
            link_address = link_address + link_item + ';'
    # Keep only paragraphs mentioning registration method ("报名方式"),
    # application method ("应聘方法"), registration time ("报名时间"),
    # or eligibility criteria ("资格条件").
    for content_item in content:
        if content_item.find("报名方式") != -1 or content_item.find("应聘方法") != -1 \
                or content_item.find("报名时间") != -1 or content_item.find("资格条件") != -1:
            content_word = content_word + content_item + ' :'
    item = ProjectItem()
    item['title'] = title
    item['data'] = data
    item['extence'] = extence
    item['content'] = content_word
    item['link'] = link_address
    yield item
def parseHtml(self, response):
    # One node per job posting, across both row classes.
    baseList = response.xpath('//tr[@class="odd"] | //tr[@class="even"]')
    for base in baseList:
        item = ProjectItem()
        # Position name
        item["PositionName"] = base.xpath('./td[1]/a/text()').extract()[0]
        # Detail link
        item["PositionLink"] = base.xpath('./td[1]/a/@href').extract()[0]
        # Category; fall back to "无" ("none") when the cell is empty
        item["PositionType"] = base.xpath('./td[2]/text()').extract()
        if item["PositionType"]:
            item["PositionType"] = item["PositionType"][0]
        else:
            item["PositionType"] = "无"
        # Headcount
        item["PositionNum"] = base.xpath('./td[3]/text()').extract()[0]
        # Location
        item["PositionAddress"] = base.xpath('./td[4]/text()').extract()[0]
        # Posting date
        item["PositionTime"] = base.xpath('./td[5]/text()').extract()[0]
        yield item
def parse(self, response):
    yield ProjectItem(name='updated_item', index=1, last=self.last)
    yield ProjectItem(name='new_item', index=1)
def parse(self, response):
    # Extract the data from each response: one node per job listing.
    node_list = response.xpath("//div[@class='dw_table']/div[@class='el']")
    total_page_pig = response.xpath("//span[@class = 'td']/text()")
    # Pull the page count out of pager text like "共55页,..." with a regex.
    pattern1 = re.compile(r"(?<=共).+?(?=页,)")
    # Note: this searches the string representation of the SelectorList,
    # which happens to contain the pager text.
    matcher1 = re.search(pattern1, r"" + str(total_page_pig))
    total_page = matcher1.group(0)  # total number of pages
    print(total_page)
    for node in node_list:
        # Build a new item per job node.
        item = ProjectItem()
        print('进入!!')  # "entering"
        # Position title: extract() returns a list, take the first element.
        item['position_name'] = node.xpath("./p/span/a/@title").extract()[0]
        # Company name.
        item['posttion_place'] = node.xpath("./span[1]/a/text()").extract()[0]
        # Location.
        item['posttion_led'] = node.xpath("./span[2]/text()").extract()[0]
        # Salary, which may be missing; '空' means "empty".
        if len(node.xpath("./span[3]/text()")):
            item['position_department'] = node.xpath("./span[3]/text()").extract()[0]
        else:
            item['position_department'] = '空'
        # Posting date.
        item['positon_time'] = node.xpath("./span[4]/text()").extract()[0]
        # Detail page URL.
        item['position_info'] = node.xpath("./span[1]/a/@href").extract()[0]
        print('1')
        yield item
    # s = (response.xpath("//li[@class='bk']/a/@href")).extract()[0]
    # print('\n\n\n\n\n\n', type(s), '\n', s, '\n\n\n\n\n\n\n')
    # s = (response.xpath("//li[@class='bk']/a/@href")).extract()[1]
    # print('\n\n\n\n\n\n', type(s), '\n', s, '\n\n\n\n\n\n\n')
    # Print the current page number.
    current_page = str(response.xpath(
        "//div[@class = 'p_in']/ul/li[@class='on']/text()").extract()[0])
    print(current_page)
    # If this is not the last page, follow the "next page" link. On page 1
    # there is only a "next" link; on later pages "previous" comes first,
    # so "next" is the second link.
    if current_page != total_page:
        if current_page == '1':
            url = response.xpath("//li[@class='bk']/a/@href").extract()[0]
        else:
            url = response.xpath("//li[@class='bk']/a/@href").extract()[1]
        yield scrapy.Request(url, callback=self.parse)
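# A quick check of the page-count regex used above, run against the kind of
# pager text 51job renders; the sample string is illustrative, not taken from
# a live page.
import re

sample = '共55页,到第'  # e.g. "55 pages in total, go to page ..."
match = re.search(r"(?<=共).+?(?=页,)", sample)
assert match and match.group(0) == '55'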