Beispiel #1
0
    def parse_item_detail(self, response):
        """Fill in the detail fields of the item carried in ``response.meta``.

        The first three ``.STYLE13`` elements are read, in order, as the
        start time, the location and the remark of the event. The
        ``.STYLE15`` markup is stored as the raw detail.

        :param response: detail-page response whose ``meta['item']`` holds
            the item to complete.
        :returns: the same item with ``link``, ``location``, ``remark``,
            ``startTime``, ``infoDetailRaw`` and ``company`` set. The three
            parsed fields stay ``None`` when fewer than three ``.STYLE13``
            elements exist or the expected text is not found.
        """
        item = response.meta['item']
        item['link'] = response.url
        infoDetailRaw = response.css('.STYLE15').extract()

        location = None
        remark = None
        startTime = None
        infos = response.css('.STYLE13').extract()
        if len(infos) >= 3:
            time_text, location_text, remark_text = [
                pq(info).text() for info in infos[:3]]

            # Sample text: "时间:  2014/11/15 09:30" (time).
            m = re.search(r'\d{4}/\d{2}/\d{2} \d{2}:\d{2}', time_text)
            if m:
                parsed = time.strptime(m.group(), "%Y/%m/%d %H:%M")
                startTime = time.strftime(CustomUtil.time_format, parsed)

            # Sample text: "地点:  世纪馆" (location). The label may end
            # with an ASCII or a full-width colon.
            m = re.search(u'地点[::](.*)', location_text)
            if m:
                location = m.group(1).strip()

            # Sample text: "备注(要求):  ..." (remark / requirements).
            # The parentheses belong to the label, so they are matched
            # through character classes; written bare they would form a
            # regex group and group(1) would no longer be the remark text.
            m = re.search(u'备注[((]要求[))][::](.*)', remark_text)
            if m:
                remark = m.group(1).strip()

        item['location'] = location
        item['remark'] = remark
        item['startTime'] = startTime

        item['infoDetailRaw'] = chc(infoDetailRaw)
        # Company data on this site is irregular, so no company fields are
        # extracted; an empty CompanyItem is attached instead.
        item['company'] = CompanyItem()
        return item
Beispiel #2
0
    def parse_item_detail(self, response):
        """Build the company record from the ``.bd_tab`` label/value table.

        Rows with an even number of cells are read as alternating
        label, value pairs and collected into a lookup dictionary. The
        company fields are then picked out by their labels.

        :param response: detail-page response whose ``meta['item']`` holds
            the item to complete.
        :returns: the item with ``link``, ``company`` and ``infoDetailRaw``
            set.
        """
        item = response.meta['item']
        item['link'] = response.url

        # Dictionary-matching approach: cell 0 labels cell 1, cell 2 labels
        # cell 3, and so on. Rows with an odd cell count are skipped.
        fields = {}
        for row in response.css('.bd_tab>tr'):
            cells = row.css('tr>td')
            if len(cells) % 2:
                continue
            texts = [pq(gfs(cell.extract())).text().strip() for cell in cells]
            for label, value in zip(texts[0::2], texts[1::2]):
                fields[label] = value

        company = CompanyItem(
            fields.get(u'单位名称'),
            prop=fields.get(u'单位性质'),
            homePage=fields.get(u'应聘网址'),
            email=fields.get(u'简历投递邮箱'),
            addr=fields.get(u'单位地址'))

        raw_detail = response.css('.bd_tab tr:nth-child(4) td').extract()
        item['company'] = company
        item['infoDetailRaw'] = chc(raw_detail)
        return item
Beispiel #3
0
 def parse_detail(self, response):
     """Set issue time, raw detail and an empty company on the meta item.

     :param response: detail-page response whose ``meta['item']`` holds the
         item to complete.
     :returns: generator yielding the completed item once.
     """
     item = response.meta['item']

     # Everything captured after "</span" inside the article-info block;
     # chc is a project helper (called here with a second argument of 1).
     issue_matches = response.xpath(
         "//div[@class='article-info']").re(r'</span>*(.*)')
     item['issueTime'] = chc(issue_matches, 1)

     detail_markup = response.xpath("//table[@class='job_detail']").extract()
     item['infoDetailRaw'] = chc(detail_markup)
     item['company'] = CompanyItem()
     yield item
Beispiel #4
0
    def parse_item_detail(self, response):
        """Build the company record from a company profile page.

        The name comes from ``h1.title``, the introduction from
        ``.company-introduction``, and the remaining fields from the
        header/value rows of ``#comTab``.

        :param response: detail-page response whose ``meta['item']`` holds
            the item to complete.
        :returns: the item with ``link``, ``infoDetailRaw`` (always
            ``None`` here) and ``company`` set.
        """
        item = response.meta['item']
        item['link'] = response.url
        # This page type provides no raw detail section.
        item['infoDetailRaw'] = None

        cname = chc(response.css('h1.title::text').extract())
        intro_markup = response.css('.company-introduction').extract()
        cintro = CustomUtil.h2t(gfs(intro_markup))

        # Map each row's header text to its value text; rows missing
        # either side are ignored.
        attributes = {}
        for row in response.css('#comTab tr'):
            header = chc(row.css('th::text').extract())
            value = chc(row.css('td::text').extract())
            if not (header and value):
                continue
            attributes[gfs(header).strip()] = gfs(value).strip()

        item['company'] = CompanyItem(
            cname,
            intro=cintro,
            prop=attributes.get(u'公司性质:'),
            scale=attributes.get(u'公司规模:'),
            industry=attributes.get(u'公司行业:'))
        return item
Beispiel #5
0
    def parse_detail_page(self, response):
        """Parse a job-fair detail page, retrying when the page is broken.

        A response without a ``.w-employee-title`` element is treated as a
        bad page. While retries remain for the item's ``sid``, the page is
        requested again under a modified URL and nothing else is emitted
        for this response. Once retries are exhausted the item is emitted
        with whatever could be extracted.

        :param response: detail-page response whose ``meta['item']`` holds
            the item to complete.
        :returns: generator yielding either one retry ``scrapy.Request`` or
            the completed item.
        """
        item = response.meta['item']
        item['link'] = response.url

        # Bad page: retry if the retry budget allows it.
        if not response.css('.w-employee-title'):
            ind = item['sid']
            if self.canTryAgain(ind, kind='detail_'):
                url = response.url + "&try_%d" % self.getTryCnt(
                    ind, kind='detail_')
                self.addTryCnt(ind, kind='detail_')
                yield scrapy.Request(url,
                                     callback=self.parse_detail_page,
                                     meta={'item': item})
                # The retried response emits the item. Stopping here keeps
                # the incomplete item from being yielded as well.
                return

        # Normal parsing (also the best-effort path once retries run out).
        ta = response.css(
            '.w-zph-title tr:nth-child(3) td:nth-child(2)::text').extract()
        if ta:
            item['targetAcademic'] = ta[0]
        tm = response.css(
            '.w-zph-title tr:nth-child(3) td:nth-child(4)::text').extract()
        if tm:
            item['targetMajor'] = tm[0]

        # Pass through chc whichever values are present, including ones
        # set on the item before this callback ran.
        if item.get('targetMajor'):
            item['targetMajor'] = chc(item['targetMajor'])
        if item.get('targetAcademic'):
            item['targetAcademic'] = chc(item['targetAcademic'])
        item['company'] = CompanyItem()
        yield item
Beispiel #6
0
    def parse_item_detail(self, response):
        """Extract event time, location and issue time from labelled lines.

        Each text node under ``td div`` is checked for one of three labels.
        The value is everything after the first colon of the line.

        :param response: detail-page response whose ``meta['item']`` holds
            the item to complete.
        :returns: the item with ``link``, ``startTime``, ``endTime``,
            ``location``, ``issueTime``, ``company`` and ``infoDetailRaw``
            set. Fields that are not found are ``None``.
        """
        item = response.meta['item']
        item['link'] = response.url
        infos = response.css('td div::text').extract()

        startTime = None
        endTime = None
        location = None
        issueTime = None
        for info in infos:
            info = info.strip()
            # Split only on the first colon: the values themselves contain
            # colons (e.g. "13:30"), and an unlimited split cuts them short.
            # Sample: "活动时间:2015-4-21 13:30—15:30" (event time)
            if u'活动时间:' in info:
                startTime, endTime = CustomUtil.splitTimes(
                    info.split(u':', 1)[1])
            # Sample: "活动地点:光电学院演讲厅" (event location)
            elif u'活动地点:' in info:
                location = info.split(u':', 1)[1]
            # Sample: "发布日期:2015-3-26 16:53:24" (issue date)
            elif u'发布日期:' in info:
                issueTime = info.split(u':', 1)[1]
                # The page value carries seconds; parse them, then
                # re-format using the shared format.
                issueTime = time.strptime(issueTime,
                                          CustomUtil.time_format + ":%S")
                issueTime = time.strftime(CustomUtil.time_format, issueTime)

        item['startTime'] = startTime
        item['endTime'] = endTime
        item['location'] = location
        item['issueTime'] = issueTime
        item['company'] = CompanyItem()

        # The raw detail is in the first nested table. Leave it as None
        # when the page has no such table instead of raising IndexError.
        infoDetailRaw = None
        tables = response.css('table table')
        if tables:
            infoDetailRaw = chc(
                tables[0].css('tr:nth-child(5) td:nth-child(2)').extract())
        item['infoDetailRaw'] = infoDetailRaw
        return item
Beispiel #7
0
 def parse_detail(self, response):
     """Collect content markup, image URLs and company intro for the item.

     :param response: detail-page response whose ``meta['item']`` holds the
         item to complete.
     :returns: generator yielding the completed item once.
     """
     item = response.meta['item']
     item['link'] = response.url

     content = "//div[@class='vContent']"
     item['image_urls'] = response.xpath(content + "//img/@src").extract()
     item['infoDetailRaw'] = response.xpath(content).extract()

     # The company starts empty; only its introduction is filled in here.
     item['company'] = CompanyItem()
     intro_markup = response.xpath(
         "//div[@class='vContent cl']/div").extract()
     item['company']['introduction'] = intro_markup
     yield item
Beispiel #8
0
    def parse_item_detail(self, response):
        """Attach link, raw article markup and an empty company to the item.

        :param response: detail-page response whose ``meta['item']`` holds
            the item to complete.
        :returns: the completed item.
        """
        item = response.meta['item']
        item['link'] = response.url
        article_markup = response.css('.articleContext').extract()
        item['infoDetailRaw'] = chc(article_markup)
        item['company'] = CompanyItem()
        return item
Beispiel #9
0
 def parse_detail(self, response):
     """Split the ``posinfo`` blocks into job detail and company intro.

     The ``[1]`` ``posinfo`` div is stored as the raw detail and the ``[2]``
     one as the company introduction.

     :param response: detail-page response whose ``meta['item']`` holds the
         item to complete.
     :returns: generator yielding the completed item once.
     """
     item = response.meta['item']
     item['link'] = response.url

     detail_markup = response.xpath("//div[@class='posinfo'][1]").extract()
     item['infoDetailRaw'] = detail_markup

     item['company'] = CompanyItem()
     intro_markup = response.xpath("//div[@class='posinfo'][2]").extract()
     item['company']['introduction'] = intro_markup
     yield item
Beispiel #10
0
 def parse_detail(self, response):
     """Take the last row of the posting form table as the raw detail.

     :param response: detail-page response whose ``meta['item']`` holds the
         item to complete.
     :returns: generator yielding the completed item once.
         ``infoDetailRaw`` is ``None`` when the form table has no rows.
     """
     item = response.meta['item']
     item['link'] = response.url
     rows = response.xpath(
         "//form[@name='zpxxglActionForm']/table/tr").extract()
     # Guard against pages where the form table is missing or empty;
     # indexing an empty list would raise and drop the whole item.
     item['infoDetailRaw'] = rows[-1] if rows else None
     item['company'] = CompanyItem()
     item['company']['introduction'] = response.xpath(
         "//div[@id='cent']/div[2]").extract()
     yield item
Beispiel #11
0
 def parse_detail(self, response):
     """Read company name, start time and location from a fixed table row.

     All fields come from positional XPaths below one container cell. The
     cells of the second row of its nested table are read as name (td 1),
     two time parts (td 2 and td 3) and location (td 4).

     :param response: detail-page response whose ``meta['item']`` holds the
         item to complete.
     :returns: generator yielding the completed item once.
     """
     item = response.meta['item']
     item['link'] = response.url

     # Shared prefixes of the positional paths used below.
     container = ("/html/body/table[2]//tr/td[3]/table//tr[2]/td"
                  "/table//tr[3]/td")
     row = container + "//table//tr[2]"

     company = CompanyItem()
     company['name'] = response.xpath(row + "/td[1]/p").extract()
     item['company'] = company

     # The start time is the concatenation of the two time cells.
     first_part = chc(response.xpath(row + "/td[2]/p").extract())
     second_part = chc(response.xpath(row + "/td[3]/p").extract())
     item['startTime'] = first_part + second_part

     item['location'] = response.xpath(row + "/td[4]/p").extract()
     item['infoDetailRaw'] = response.xpath(container).extract()
     yield item
Beispiel #12
0
    def parse_item_detail(self, response):
        """Attach raw event markup and the issue time found in the body.

        :param response: detail-page response whose ``meta['item']`` holds
            the item to complete.
        :returns: the item with ``link``, ``company``, ``infoDetailRaw``
            and ``issueTime`` set. ``issueTime`` is ``None`` when the label
            is not found in the response body.
        """
        item = response.meta['item']
        item['link'] = response.url
        event_markup = response.css('.event_content').extract()

        # Issue-time line looks like "发布时间: 2015-05-12 10:00"; capture
        # the timestamp that follows the label.
        issue_regex = '发布时间: (' + CustomUtil.time_pattern + ')'
        found = re.search(issue_regex, response.body)
        issueTime = found.group(1) if found else None

        # Company data on this site is irregular, so no company fields are
        # extracted; an empty CompanyItem is attached instead.
        item['company'] = CompanyItem()
        item['infoDetailRaw'] = chc(event_markup)
        item['issueTime'] = issueTime
        return item
Beispiel #13
0
 def parse_detail(self, response):
     """Attach link, raw ``wznr`` markup and an empty company to the item.

     :param response: detail-page response whose ``meta['item']`` holds the
         item to complete.
     :returns: generator yielding the completed item once.
     """
     item = response.meta['item']
     item['link'] = response.url
     content_markup = response.xpath("//div[@class='wznr']").extract()
     item['infoDetailRaw'] = content_markup
     item['company'] = CompanyItem()
     # NOTE: field-level extraction (location, start date/time, target
     # degree and major, related links, remarks) via positional
     # "//div[@class='wznr']/span/div[N]/span/text()" paths was previously
     # drafted here and is disabled; only the raw markup is kept.
     yield item
Beispiel #14
0
 def parse_detail(self, response):
     """Attach link, raw detail cell markup and an empty company.

     :param response: detail-page response whose ``meta['item']`` holds the
         item to complete.
     :returns: generator yielding the completed item once.
     """
     item = response.meta['item']
     item['link'] = response.url
     # The detail is the cell(s) of the fourth row of the content table.
     detail_cells = response.xpath(
         "//div[@class='content']/table//tr[4]/td").extract()
     item['infoDetailRaw'] = detail_cells
     item['company'] = CompanyItem()
     yield item