Example 1
    def generateTempitemForGet(self, response):
        parsed = urlparse.urlparse(response.url)
        parameters = urlparse.parse_qs(parsed.query)

        item = ProjectItem()
        url = parsed.geturl()
        if "?" in url:
            item['url'] = url[:url.find('?')]
        else:
            item['url'] = url

        item['param'] = parameters
        item['type'] = "GET"
        if self.login_required:
            item["loginrequired"] = "true"
            item["loginurl"] = self.login_url
        else:
            item["loginrequired"] = "false"
            item["loginurl"] = ""

        referer = response.request.headers.get("Referer")
        item["headers"] = {
            "referer": referer,
            "user-agent": response.request.headers["User-Agent"]
        }
        return item
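
This helper assumes the Python 2 `urlparse` module for `urlparse.urlparse` and `urlparse.parse_qs`. A minimal compatibility shim, assuming the same code also needs to run on Python 3 (where these functions live in `urllib.parse`), could be:

    try:
        import urlparse  # Python 2
    except ImportError:
        # Python 3: urllib.parse exposes urlparse() and parse_qs() under the same names
        from urllib import parse as urlparse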
Example 2
    def parse(self, response):
        part = response.meta['part']
        # inspect_response(response, self)
        for li in response.xpath('//ul[@class="for-list"]/li')[1:2]:
            item = ProjectItem()
            item['part'] = part
            item['title'] = li.xpath('div[@class="titlelink box"]/a[@class="truetit"]//text()').get().strip().replace(
                '\n', ' ')
            href = li.xpath('div[@class="titlelink box"]/a[@class="truetit"]/@href').get()
            item['href'] = 'https://bbs.hupu.com' + href
            item['id'] = re.search(r'\d+', href).group()
            item['author'] = li.xpath('div[@class="author box"]/a[@class="aulink"]/text()').get()
            item['reply_count'] = li.xpath('span[@class="ansour box"]/text()').get().split('\xa0')[0]
            item['view_count'] = li.xpath('span[@class="ansour box"]/text()').get().split('\xa0')[-1]

            link_count_text = li.xpath('div[@class="titlelink box"]/span[@class="light_r  "]/a/@title').get()
            if link_count_text:
                link_count = re.search(r'\d+', link_count_text).group()
            else:
                link_count = 0
            item['link_count'] = link_count
            item['comment'] = []
            # yield item
            total_page = (int(item['reply_count']) - 1) // 20 + 1
            print('{},{},{},{},{},{},{}, total pages: {}'.format(item['id'], item['href'], item['title'],
                                                                 item['author'], item['reply_count'],
                                                                 item['view_count'], item['link_count'],
                                                                 total_page))

            if item['reply_count'] != '0':
                yield scrapy.Request('https://m.hupu.com/api/bbs/replies/{}-1'.format(item['id']),
                                     meta={'item': item, 'total_page': total_page, 'page': 1},
                                     callback=self.comment)
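
A minimal sketch of the ProjectItem declaration this parser assumes (the real class lives in the project's items.py; the field names are taken from the assignments above):

    import scrapy

    class ProjectItem(scrapy.Item):
        part = scrapy.Field()
        title = scrapy.Field()
        href = scrapy.Field()
        id = scrapy.Field()
        author = scrapy.Field()
        reply_count = scrapy.Field()
        view_count = scrapy.Field()
        link_count = scrapy.Field()
        comment = scrapy.Field()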
Example 3
    def parse_items(self, response):

        item = ProjectItem()
        item['url'] = response.url
        item['text'] = self.get_text(response)
        domain = self.get_domain(response.url)

        item['unique_id'] = self.domain_to_id[domain]
        item['depth'] = response.request.meta['depth']  # uses DepthMiddleware
        print("Depth: ", item['depth'])

        yield item
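
`response.request.meta['depth']` is filled in by Scrapy's built-in DepthMiddleware, which is enabled by default. To actually cap the crawl depth, a project would set it in settings.py; a sketch (the limit value here is an assumption):

    # settings.py
    DEPTH_LIMIT = 3             # drop requests more than 3 hops from the start URLs
    DEPTH_STATS_VERBOSE = True  # collect per-depth request counts in the crawl stats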
Example 4
    def parse(self, response):
        item = ProjectItem()
        item['Confirmed'] = response.xpath(
            '/html/body/section[1]/div/div[1]/div[1]/text()').extract_first()
        item['Suspected'] = response.xpath(
            '/html/body/section[1]/div/div[2]/div[1]/text()').extract_first()
        item['Healing'] = response.xpath(
            '/html/body/section[1]/div/div[3]/div[1]/text()').extract_first()
        item['Death'] = response.xpath(
            '/html/body/section[1]/div/div[4]/div[1]/text()').extract_first()
        item['Deadline'] = response.xpath(
            '/html/body/section[1]/p[1]/text()').extract_first()[3:]
        yield item
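
`extract_first()` returns None when an absolute XPath matches nothing, so the `[3:]` slice on the Deadline field raises a TypeError as soon as the page layout changes. A defensive variant, offered as a sketch rather than the original author's code:

    deadline = response.xpath(
        '/html/body/section[1]/p[1]/text()').extract_first(default='')
    item['Deadline'] = deadline[3:]  # an empty string stays empty instead of raising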
Example 5
    def parse_item(self, response):
        i = 0
        for quote in response.css('article'):
            # Build the item from the first article only; the same item object
            # is then yielded once per article by the loop below
            if i == 0:
                item = ProjectItem()
                file_url = quote.css(
                    'a[data-entity-type="file"]::attr(href)').get()
                file_url = response.urljoin(file_url)
                item['file_urls'] = [file_url]
                item['file_names'] = file_url.split("/")[-1]
                i += 1

            yield item
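
`file_urls` is the conventional input field of Scrapy's FilesPipeline, so this item only triggers a download if the pipeline is enabled. A sketch of the settings this assumes (the storage path is hypothetical):

    # settings.py
    ITEM_PIPELINES = {
        'scrapy.pipelines.files.FilesPipeline': 1,
    }
    FILES_STORE = '/tmp/downloaded_files'  # hypothetical download directory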
Example 6
    def generateLoginItem(self, form_data, action):
        self.login_url = action
        ItemPost = ProjectItem()
        ItemPost["url"] = self.login_url

        output_form_data = {}
        for key in form_data.keys():
            output_form_data[key] = [form_data[key]]
        ItemPost["param"] = output_form_data

        ItemPost["type"] = "POST"
        ItemPost["loginrequired"] = "false"
        ItemPost["loginurl"] = ""
        return ItemPost
Example 7
    def generate_login_item(self, form_data, action):
        self.login_url = action
        post_item = ProjectItem()
        post_item["url"] = self.login_url

        output_form_data = {}
        for key in form_data.keys():
            output_form_data[key] = [form_data[key]]
        post_item["param"] = output_form_data

        post_item["type"] = "POST"
        post_item["loginrequired"] = "false"
        post_item["loginurl"] = ""
        return post_item
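
Examples 6 and 7 are the same helper in camelCase and snake_case form; both expect `form_data` as a flat dict mapping field names to values, wrapping each value in a list. A hypothetical call site that scrapes a login form into that shape might look like:

    def parse_login_page(self, response):  # hypothetical caller, not from the original project
        form = response.xpath('//form[@id="login"]')  # assumed form id
        action = response.urljoin(form.xpath('./@action').get(''))
        form_data = {
            inp.xpath('./@name').get(): inp.xpath('./@value').get('')
            for inp in form.xpath('.//input[@name]')
        }
        yield self.generate_login_item(form_data, action)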
Example 8
    def parse(self, response):

        # title = scrapy.Field()  # title field
        # week = scrapy.Field()   # weekday field
        # img = scrapy.Field()    # image field
        # temp = scrapy.Field()   # temperature field
        # rain = scrapy.Field()   # rainfall field
        # wind = scrapy.Field()   # wind field

        # Select the HTML blocks holding the six days of weather, to loop over below
        sixday_detail = response.xpath('//div[@class="tqshow1"]')
        # List that collects the six days' weather items
        items = []
        # Loop over each day's block and extract the target fields
        for day in sixday_detail:
            # Instantiate an item from the ProjectItem class defined in items.py
            item = ProjectItem()

            # Extract the title fragments and concatenate them into one str
            datetitle = ''
            for date in day.xpath('./h3//text()').extract():
                datetitle += date

            item['title'] = datetitle

            # Extract the weekday; extract() returns a list, so take element [0]
            item['week'] = day.xpath('./p//text()').extract()[0]

            # Extract the image; extract() returns a list, so take element [0]
            item['img'] = day.xpath(
                './ul/li[@class="tqpng"]/img/@src').extract()[0]

            # Extract the temperature fragments and concatenate them into one str
            templist = ''
            for temprange in day.xpath('./ul/li[2]//text()').extract():
                templist += temprange
            item['temp'] = templist

            # Extract the rainfall; extract() returns a list, so take element [0]
            item['rain'] = day.xpath('./ul/li[3]//text()').extract()[0]

            # Extract the wind; extract() returns a list, so take element [0]
            item['wind'] = day.xpath('./ul/li[4]//text()').extract()[0]

            # Append the item to the items list
            items.append(item)

        print('----------------bye from spider------------------------')
        return items
Example 9
    def generate_post_item(self, post_form):
        post_item = ProjectItem()
        post_item["url"] = post_form["url"]
        post_item["param"] = post_form["fields"]
        post_item["type"] = "POST"
        if self.login_required:
            post_item["loginrequired"] = "true"
            post_item["loginurl"] = self.login_url
        else:
            post_item["loginrequired"] = "false"
            post_item["loginurl"] = ""

        if post_item["param"]:
            return post_item
        return None
Example 10
    def start_requests(self):

        # for part, link in parts.items():
        # open('results/{}.txt'.format(part), 'w').close()
        # for i in range(2, 3):  # crawl the first ten pages
        #     yield scrapy.Request('https://bbs.hupu.com/rockets-{}'.format(i), meta={'part': '火箭专区'})
        # yield scrapy.Request('https://bbs.hupu.com/26073334.html', meta={'part': '火箭专区'}, callback=self.comment())
        item = ProjectItem()
        item['id'] = '26073334'
        item['comment'] = []
        item['part'] = '火箭专区'
        item['title'] = '【一图流】哈登压哨后撤三分命中'
        item['author'] = 'LonzoBa'
        item['reply_count'] = 0
        item['link_count'] = 0
        item['view_count'] = 0
        yield scrapy.Request('https://m.hupu.com/api/bbs/replies/26073334-1',
                             meta={'item': item, 'total_page': 1, 'page': 1},
                             callback=self.comment)
Example 11
    def generatePostItem(self, post_form):
        ItemPost = ProjectItem()
        ItemPost["url"] = post_form["url"]
        ItemPost["param"] = post_form["fields"]
        self.log(post_form["isGet"])
        if post_form["isGet"]:
            ItemPost["type"] = "GET"
        else:
            ItemPost["type"] = "POST"
        if self.login_required:
            ItemPost["loginrequired"] = "true"
            ItemPost["loginurl"] = self.login_url
        else:
            ItemPost["loginrequired"] = "false"
            ItemPost["loginurl"] = ""

        if ItemPost["param"]:
            return ItemPost
        return None
Example 12
    def generateTempitemForGetNoResp(self, response_url):
        parsed = urlparse.urlparse(response_url)
        parameters = urlparse.parse_qs(parsed.query)

        item = ProjectItem()
        url = parsed.geturl()
        if "?" in url:
            item['url'] = url[:url.find('?')]
        else:
            item['url'] = url

        item['param'] = parameters
        item['type'] = "GET"
        if self.login_required:
            item["loginrequired"] = "true"
            item["loginurl"] = self.login_url
        else:
            item["loginrequired"] = "false"
            item["loginurl"] = ""

        item["headers"] = {}
        return item
Example 13
    def parse_new_page(self, response):
        # print(response.text)
        link_address = ''
        content_word = ''
        title = response.css('.show_con_title ::text').extract()[0]
        data = response.css('.show_con_info em:nth-child(1)::text').extract()[0][-19:-1]
        extence = response.css('.show_con_info em:nth-child(3)::text').extract()[0]
        content = response.css('.show_con_box p::text').extract()
        link = response.css('.show_con_box a[href^="http"]::attr(href)').extract()
        # Collect links to .doc / .xls / .xlsx attachments
        for link_item in link:
            if '.doc' in link_item or '.xls' in link_item or '.xlsx' in link_item:
                link_address = link_address + link_item + ';'
        # Keep only paragraphs matching the keywords: how to register / how to
        # apply / registration period / eligibility requirements
        for content_item in content:
            if ("报名方式" in content_item or "应聘方法" in content_item
                    or "报名时间" in content_item or "资格条件" in content_item):
                content_word = content_word + content_item + '   :'

        item = ProjectItem()
        item['title'] = title
        item['data'] = data
        item['extence'] = extence
        item['content'] = content_word
        item['link'] = link_address

        yield item
Example 14
    def parseHtml(self, response):
        # List of node objects, one per job position
        baseList = response.xpath('//tr[@class="odd"] | //tr[@class="even"]')
        for base in baseList:
            item = ProjectItem()
            item["PositionName"] = base.xpath('./td[1]/a/text()').extract()[0]
            # Detail link
            item["PositionLink"] = base.xpath('./td[1]/a/@href').extract()[0]

            # Category
            item["PositionType"] = base.xpath('./td[2]/text()').extract()
            if item["PositionType"]:
                item["PositionType"] = item["PositionType"][0]
            else:
                item["PositionType"] = "无"  # "无" means "none"

            # Headcount
            item["PositionNum"] = base.xpath('./td[3]/text()').extract()[0]
            # Location
            item["PositionAddress"] = base.xpath('./td[4]/text()').extract()[0]
            # Posting date
            item["PositionTime"] = base.xpath('./td[5]/text()').extract()[0]

            yield item
Example 15
    def parse(self, response):
        yield ProjectItem(name='updated_item', index=1, last=self.last)
        yield ProjectItem(name='new_item', index=1)
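
This test-style spider assumes ProjectItem declares name, index, and last fields; a matching sketch:

    import scrapy

    class ProjectItem(scrapy.Item):
        name = scrapy.Field()
        index = scrapy.Field()
        last = scrapy.Field()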
Example 16
    def parse(self, response):
        # Extract the data from each response
        node_list = response.xpath("//div[@class='dw_table']/div[@class='el']")
        total_page_pig = response.xpath("//span[@class = 'td']/text()")
        # Regex that pulls the page count out of the text between 共 and 页,
        pattern1 = re.compile(r"(?<=共).+?(?=页,)")
        matcher1 = re.search(pattern1, str(total_page_pig))  # search the selector text for a match
        total_page = matcher1.group(0)  # total number of pages
        print(total_page)

        for node in node_list:
            # Build a new item
            item = ProjectItem()
            item['position_name'] = node.xpath("./p/span/a/@title").extract()[0]  # job title
            item['posttion_place'] = node.xpath("./span[1]/a/text()").extract()[0]  # company
            item['posttion_led'] = node.xpath("./span[2]/text()").extract()[0]  # location
            if len(node.xpath("./span[3]/text()")):
                item['position_department'] = node.xpath("./span[3]/text()").extract()[0]  # salary
            else:
                item['position_department'] = '空'  # '空' means "empty"
            item['positon_time'] = node.xpath("./span[4]/text()").extract()[0]  # posting date
            item['position_info'] = node.xpath("./span[1]/a/@href").extract()[0]  # detail-page URL
            yield item

        # s = (response.xpath("//li[@class='bk']/a/@href")).extract()[0]

        # print('\n\n\n\n\n\n',type(s),'\n',s,'\n\n\n\n\n\n\n')

        # s = (response.xpath("//li[@class='bk']/a/@href")).extract()[1]

        # print('\n\n\n\n\n\n',type(s),'\n',s,'\n\n\n\n\n\n\n')

        # Print the current page number
        current_page = response.xpath(
            "//div[@class = 'p_in']/ul/li[@class='on']/text()").extract()[0]
        print('\n\n\n\n\n\n', current_page, '\n\n\n\n\n\n\n')

        # Check whether the current page is the last one
        if current_page != total_page:
            if current_page == '1':
                url = (
                    response.xpath("//li[@class='bk']/a/@href")).extract()[0]
                yield scrapy.Request(url, callback=self.parse)
            else:
                url = (
                    response.xpath("//li[@class='bk']/a/@href")).extract()[1]
                # print('\n\n\n\n\n',url,'\n\n\n\n\n\n')
                yield scrapy.Request(url, callback=self.parse)
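
Running the regex over `str(total_page_pig)` works only because the repr of a SelectorList happens to contain the matched text. An equivalent, more idiomatic sketch uses the selector's own regex helper:

    total_page = response.xpath(
        "//span[@class = 'td']/text()").re_first(r'共(\d+)页')  # e.g. '共12页,' -> '12'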