# Shared imports assumed by the spider methods below; the item classes
# (CsharpItem, FileItem, SlcoinItem, TestscrapytemplateItem, WeiboItem,
# ForprogrammerItem) and helpers such as Queries and slugify come from the
# individual spider projects.
import csv
import os
import re
import time
import urllib.request

import scrapy
from scrapy import Request


def parse(self, response):
    # the rules how to deal with the pages you get
    # learn more about 'xpath' grammar
    conference = response.xpath('//td/a[1]').re(r'<a href="([^"]*)"')
    t = response.xpath('//a[1]/font[@style="line-height:130%;"]/text()').extract()
    csvfile = open('output/A-12.csv', 'a', newline='', errors='ignore', encoding='gbk')
    writer = csv.writer(csvfile)
    for i in range(len(conference)):
        print(conference[i])
        if re.findall(r'(\S*)\.pdf', conference[i]) or re.findall(r'(\S*)\.doc', conference[i]):
            name = conference[i].split('/')
            if not os.path.exists('file/A-12/'):
                os.mkdir('file/A-12/')
            print('file/A-12/' + str(name[len(name) - 1]) + ' ' + conference[i])
            try:
                urllib.request.urlretrieve(conference[i], 'file/A-12/' + str(name[len(name) - 1]))
                writer.writerow([t[i], conference[i]])
            except Exception:
                writer.writerow([t[i], ''])
                print("Error!")
                continue
        else:
            print("This is an html page.")
            # yield Request(conference[i], meta={'url': conference[i]}, callback=self.parsecontent_temps)
    # close the file
    csvfile.flush()
    csvfile.close()
def parse_item(self, response):
    item = CsharpItem()
    sno = response.xpath(
        '//div[@class = "submission-full"]/div[@class = "header"]/h3[@class = "title"]/text()'
    ).extract()
    name = response.xpath(
        '//div[@class = "author"]/div[@class = "fullname"]/a/text()'
    ).extract()
    info = response.xpath(
        '//div[@class = "attachments"]/ul/li/a/@href').extract()
    item['name'] = name[0].replace(' ', '')
    item['sno'] = sno[0]
    item['info'] = info[0]
    yield item

    # url = response.urljoin(info[0].replace('?forcedownload=1', ''))
    url = response.urljoin(info[0])
    zipper = FileItem()
    zipper['file_urls'] = [url]
    zipper['name'] = name[0].replace(' ', '')
    zipper['sno'] = sno[0]
    zipper['info'] = info[0]
    yield zipper
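# FileItem above fills the standard 'file_urls' field, so the actual download is
# left to Scrapy's built-in FilesPipeline. A minimal sketch of the settings that
# would enable it (the FILES_STORE path is an assumption, not taken from the
# original project):
ITEM_PIPELINES = {
    'scrapy.pipelines.files.FilesPipeline': 1,
}
FILES_STORE = 'downloads'  # hypothetical directory for the fetched attachments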
def parse(self, response):
    self.flag = True
    # walk every user's profile link on the current page
    cxy = response.xpath("//div[@class='user-avatar ui tiny image']//@href").extract()
    for i in cxy:
        yield scrapy.Request(url=i, callback=self.zh)
    # crawl the whole site
    while self.flag:
        no = int(response.xpath("//div[@class='ui pagination menu']//a[@class='item active']//text()").extract()[0])
        if no > 9:
            if (no + 10) % no == 0:
                # pages that are an exact multiple of 10
                next_page = response.xpath("//div[@class='ui pagination menu']//a//@href").extract()[2]
            else:
                # pages between two multiples of 10
                next_page = response.xpath("//div[@class='ui pagination menu']//a//@href").extract()[int(no % 10 + 2)]
        else:
            # pages below 10
            next_page = response.xpath("//div[@class='ui pagination menu']//a//@href").extract()[int(no)]
        # check whether a next-page link exists
        if next_page is not None:
            # if it does, response.urljoin(next_page) turns the relative path
            # (e.g. page/1) into an absolute one by prepending the site domain,
            # e.g. http://lab.scrapyd.cn/page/1
            next_page = response.urljoin(next_page)
            self.flag = False
            yield scrapy.Request(next_page, callback=self.parse)
def parsecontent_temps(self, response):
    # parameters can be got by response.meta['key']
    # url = response.meta['url']
    t = response.xpath('//div[@id="cke_pastebin"]/p[1]').extract()
    if not t:
        t = response.xpath('//td[@class="STYLE3"]/p[1]').extract()
    # content_temp = response.xpath('//td[@style="FONT-SIZE: 16pt; FONT-FAMILY: 宋体; mso-ascii-font-family: \'Times New Roman\'; mso-hansi-font-family: \'Times New Roman\'"]')[0].extract()
    # t = response.xpath('//p[@class="MsoNormal"]/span').re('style="FONT-SIZE: 22pt; COLOR: red; FONT-FAMILY: 宋体; mso-ascii-font-family: \'Times New Roman\'; mso-hansi-font-family: \'Times New Roman\'">(\S*)</span>')
    c = response.xpath('//div[@id="cke_pastebin"]/p[position()>1]').extract()
    if not c:
        c = response.xpath('//td[@class="STYLE3"]/p[position()>1]').extract()
    title = ''
    for i in t:
        title += i
    title = re.sub('<[^>]+>', '', title)
    title = re.sub('[\t\n\r]', '', title)
    title = re.sub(' +', ' ', title)
    content_temp = ''
    for i in c:
        content_temp += i
    # use regular expression to resolve the pages you get, especially html elements
    content_temp = re.sub('<[^>]+>', '', content_temp)
    content_temp = re.sub('[\t\n\r]', '', content_temp)
    content_temp = re.sub(' +', ' ', content_temp)
    # open the file and write and close it
    csvfile = open('output/A-11.csv', 'a', newline='', errors='ignore', encoding='gbk')
    writer = csv.writer(csvfile)
    if title == '':
        title = response.url
    writer.writerow([title, content_temp])
    csvfile.flush()
    csvfile.close()
    relative_urls = response.xpath('..//a[@href]').re(r'<a href="([^"]*)"')
    if relative_urls:
        for i in range(len(relative_urls)):
            relative_url = relative_urls[i]
            relative_url = re.sub('<[^>]+>', '', relative_url)
            relative_url = re.sub('[\t\n\r]', '', relative_url)
            relative_url = re.sub(' +', ' ', relative_url)
            absolute_url = 'http://www.gsc.org.cn/' + relative_url
            if re.findall(r'(\S*)\.pdf|(\S*)\.doc', absolute_url):
                name = absolute_url.split('/')
                if not os.path.exists('file/A-11/'):
                    os.mkdir('file/A-11/')
                print('file/A-11/' + str(name[len(name) - 1]) + ' ' + absolute_url)
                urllib.request.urlretrieve(absolute_url, 'file/A-11/' + str(name[len(name) - 1]))
def parsecontent_temps(self, response):
    # parameters can be got by response.meta['key']
    # url = response.meta['url']
    t = response.xpath(
        '//td[@style="width:700px;text-align:center;font-size:24px;font-weight:bold;line-height:40px;color:#000000;"]/text()'
    )[0].extract()
    # content_temp = response.xpath('//td[@style="FONT-SIZE: 16pt; FONT-FAMILY: 宋体; mso-ascii-font-family: \'Times New Roman\'; mso-hansi-font-family: \'Times New Roman\'"]')[0].extract()
    # t = response.xpath('//p[@class="MsoNormal"]/span').re('style="FONT-SIZE: 22pt; COLOR: red; FONT-FAMILY: 宋体; mso-ascii-font-family: \'Times New Roman\'; mso-hansi-font-family: \'Times New Roman\'">(\S*)</span>')
    c1 = response.xpath('//p[@align="left"]').extract()
    c2 = response.xpath('//p[@class ="MsoNormal"]').extract()
    c3 = response.xpath(
        '//p[@style="width:730px;text-align:left;line-height:28px;"]'
    ).extract()
    # c2 = response.xpath('//p[@style="text-align:left;text-indent:2em;font-family:SimSun;font-size:18px;"]').re("\">(\S*)</p>")
    title = t
    content_temp = ''
    for i in c1:
        content_temp += i
    for i in c2:
        content_temp += i
    for i in c3:
        content_temp += i
    # if re.findall("<a target=\"_blank\" href=\"(\S*)\.pdf\"", content_temp):
    #     file_url = re.findall("<a target=\"_blank\" href=\"[^\"]*\"", content_temp)
    #     file_name = re.findall("<a target=\"_blank\" href=\"[^\"]*\">(\S*)</a>", content_temp)
    #     file_url[0] = re.sub("<a target=\"_blank\" href=\"", "", file_url[0])
    #     file_url[0] = re.sub("\"", "", file_url[0])
    #     name = file_name[0].split('/')
    #     if not os.path.exists('file/A-08/'):
    #         os.mkdir('file/A-08/')
    #     print('file/A-08/' + str(name[len(name) - 1]) + ' ' + "http://www.cms1924.org" + file_url[0])
    #     urllib.request.urlretrieve("http://www.cms1924.org" + file_url[0], 'file/A-08/' + str(name[len(name) - 1]))
    # use regular expression to resolve the pages you get, especially html elements
    content_temp = re.sub('<[^>]+>', '', content_temp)
    content_temp = re.sub('[\t\n\r]', '', content_temp)
    content_temp = re.sub(' +', ' ', content_temp)
    # open the file and write and close it
    csvfile = open('output/A-10.csv', 'a', newline='', errors='ignore', encoding='gbk')
    writer = csv.writer(csvfile)
    writer.writerow([title, content_temp])
    csvfile.flush()
    csvfile.close()
def parse(self, response):
    item = SlcoinItem()
    list = response.xpath('//*[@id="table"]/tbody')
    coin_type = list.xpath('//tr/td[2]/a/img/@alt').extract()
    coin_money = list.xpath('//tr/td[4]/a/@data-usd').extract()
    coin_upDown = list.xpath('//tr/td[7]/span/text()').extract()
    for i in range(0, 100):
        if coin_type[i] == 'BTC-比特币' or coin_type[i] == 'ETH-以太坊' or \
                coin_type[i] == 'EOS-柚子' or coin_type[i] == 'LTC-莱特币' or \
                coin_type[i] == 'XMR-门罗币' or coin_type[i] == 'NEO-小蚁' or \
                coin_type[i] == 'ETC-以太经典' or coin_type[i] == 'OMG-嫩模币' or \
                coin_type[i] == 'BNB-币安币' or coin_type[i] == 'HT-火币积分' or \
                coin_type[i] == 'ZEC-大零币' or coin_type[i] == 'DCR' or \
                coin_type[i] == 'MKR' or coin_type[i] == 'REP' or \
                coin_type[i] == 'GXS-公信宝' or coin_type[i] == 'EMC-崛起币' or \
                coin_type[i] == 'ZEN' or coin_type[i] == 'VERI' or \
                coin_type[i] == 'XZC-小零币' or coin_type[i] == 'FCT-公证通':
            item = SlcoinItem()
            item['coin_type'] = re.sub("[\u4e00-\u9fa5]|-", "", coin_type[i])
            item['coin_money'] = coin_money[i]
            item["time"] = time.strftime('%Y-%m-%d %H-%M-%S', time.localtime(time.time()))
            item["coin_upDown"] = coin_upDown[i]
            yield item
        else:
            print("I do not want it")
    next_page = ['list_2.html#USD']
    next_page = ','.join(next_page)
    if next_page is not None:
        next_page = response.urljoin(next_page)
        yield scrapy.Request(next_page, callback=self.parse)
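# The long chain of equality checks above can be written as one membership test
# against a set. A behavior-equivalent sketch (the wanted_coins name is
# illustrative and not part of the original spider):
wanted_coins = {
    'BTC-比特币', 'ETH-以太坊', 'EOS-柚子', 'LTC-莱特币', 'XMR-门罗币',
    'NEO-小蚁', 'ETC-以太经典', 'OMG-嫩模币', 'BNB-币安币', 'HT-火币积分',
    'ZEC-大零币', 'DCR', 'MKR', 'REP', 'GXS-公信宝', 'EMC-崛起币',
    'ZEN', 'VERI', 'XZC-小零币', 'FCT-公证通',
}
# ...then inside the loop:
#     if coin_type[i] in wanted_coins: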
def parse_url(self, response):
    urls = response.xpath(
        '//tr/td[@class = "submission cell c1"]/a[@class = "title"]/@href'
    ).extract()
    for url in urls:
        yield scrapy.Request(url=url,
                             headers=self.headers,
                             cookies=self.cookies,
                             callback=self.parse_item)
def parseAppDetail(self, response):
    print('-------11111, meta ' + str(response.meta))
    appRank = response.meta['rank']
    print('-------2222, rank ' + str(appRank))
    appName = response.xpath(
        '//*[@id="J_DetDataContainer"]/div/div[1]/div[2]/div[1]/div[1]/text()'
    ).extract()[0]
    appDownload = response.xpath(
        '//*[@id="J_DetDataContainer"]/div/div[1]/div[2]/div[3]/div[1]/text()'
    ).extract()[0]
    appType = response.xpath('//*[@id="J_DetCate"]/text()').extract()[0]
    item = TestscrapytemplateItem()
    item['appName'] = appName
    item['appDownloadCount'] = appDownload
    item['appType'] = appType
    item['appLink'] = response.url
    item['appRank'] = appRank
    print('-------222, appName ' + appName + ' type = ' + appType +
          ' download = ' + appDownload)
    yield item
def parsecontent_temps(self, response):
    # parameters can be got by response.meta['key']
    # url = response.meta['url']
    title = response.xpath(
        '//td[@class ="contentTitle"]/text()')[0].extract()
    content_temp = response.xpath(
        '//td[@class="contentDetail"]')[0].extract()
    if re.findall(r'<a target="_blank" href="(\S*)\.pdf"', content_temp):
        file_url = re.findall(r'<a target="_blank" href="[^"]*"', content_temp)
        file_name = re.findall(r'<a target="_blank" href="[^"]*">(\S*)</a>', content_temp)
        file_url[0] = re.sub('<a target="_blank" href="', '', file_url[0])
        file_url[0] = re.sub('"', '', file_url[0])
        name = file_name[0].split('/')
        if not os.path.exists('file/A-08/'):
            os.mkdir('file/A-08/')
        print('file/A-08/' + str(name[len(name) - 1]) + ' ' + "http://www.cms1924.org" + file_url[0])
        urllib.request.urlretrieve("http://www.cms1924.org" + file_url[0],
                                   'file/A-08/' + str(name[len(name) - 1]))
    # use regular expression to resolve the pages you get, especially html elements
    content_temp = re.sub('<[^>]+>', '', content_temp)
    content_temp = re.sub('[\t\n\r]', '', content_temp)
    content_temp = re.sub(' +', ' ', content_temp)
    # open the file and write and close it
    csvfile = open('output/A-08.csv', 'a', newline='', errors='ignore', encoding='gbk')
    writer = csv.writer(csvfile)
    writer.writerow([title, content_temp])
    csvfile.flush()
    csvfile.close()
def parse_detail(self, response):
    id = re.search(r'comment/(.*?)\?', response.url).group(1)
    url = response.url
    content = ''.join(
        response.xpath(
            '//div[@id="M_"]//span[@class="ctt"]//text()').extract())
    print(id, url, content)
    comment_count = response.xpath(
        '//span[@class="pms"]//text()').re_first(r'评论\[(.*)]')
    forward_count = response.xpath(
        '//a[contains(.,"转发[")]//text()').re_first(r'转发\[(.*)]')
    like_count = response.xpath('//a[contains(.,"赞[")]').re_first(r'赞\[(.*)]')
    print(comment_count, forward_count, like_count)
    posted_at = response.xpath(
        '//div[@id="M_"]//span[@class="ct"]//text()').extract_first(default=None)
    user = response.xpath('//div[@id="M_"]/div[1]/a/text()').extract_first(default=None)
    keyword = response.meta['keyword']
    weibo_item = WeiboItem()
    # every local variable whose name matches a WeiboItem field is copied into the item
    for field in weibo_item.fields:
        try:
            weibo_item[field] = eval(field)
        except NameError:
            self.logger.debug('Field is not defined: ' + field)
    yield weibo_item
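# eval(field) above looks up each field by evaluating its name in the local
# scope. A sketch of the same idea without eval(); the helper name is
# illustrative and not part of the original spider:
def fill_item_fields(item, values):
    """Copy values into the item for every declared field present in values."""
    for field in item.fields:
        if field in values:
            item[field] = values[field]
    return item

# usage inside parse_detail(), assuming the same local variable names:
#     fill_item_fields(weibo_item, {'id': id, 'url': url, 'content': content,
#                                   'comment_count': comment_count,
#                                   'forward_count': forward_count,
#                                   'like_count': like_count,
#                                   'posted_at': posted_at, 'user': user,
#                                   'keyword': keyword})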
def parse(self, response):
    nodesAll = response.xpath(
        '//*[@id = "J_RankTabBody"]/li[2]/ul/child::*').extract()
    rankNum = len(nodesAll)
    for i in range(rankNum):
        appName = response.xpath('//*[@id="J_RankTabBody"]/li[2]/ul/li[' +
                                 str(i + 1) + ']/div[1]/a/text()').extract()[0]
        appLink = response.xpath('//*[@id="J_RankTabBody"]/li[2]/ul/li[' +
                                 str(i + 1) + ']/a/@href').extract()[0]
        appRank = i
        appDownloadCount = response.xpath(
            '//*[@id="J_RankTabBody"]/li[2]/ul/li[' + str(i + 1) +
            ']/div[1]/div[1]/span/text()').extract()[0]
        print('parse.....rank = ' + str(appRank) + ', link = ' + str(appLink) +
              ', downloadCount = ' + str(appDownloadCount) + ' appName = ' +
              str(appName))
        detailLink = str(appLink).replace('..', 'https://sj.qq.com')
        print(detailLink)
        yield scrapy.Request(detailLink,
                             self.parseAppDetail,
                             meta={'rank': appRank},
                             flags=[appRank, 'sdfsdf'])
def parse_index(self, response):
    # print(response.text)
    weibos = response.xpath('//div[@class="c" and contains(@id, "M_")]')
    print(weibos)
    for weibo in weibos:
        is_forward = bool(
            weibo.xpath('.//span[@class="cmt"]').extract_first())
        if is_forward:
            detail_url = weibo.xpath(
                './/a[contains(., "原文评论[")]//@href').extract_first()
        else:
            detail_url = weibo.xpath(
                './/a[contains(., "评论[")]//@href').extract_first()
        print(detail_url)
        yield Request(detail_url, callback=self.parse_detail)
def parse(self, response):
    # the rules how to deal with the pages you get
    # learn more about 'xpath' grammar
    conference = response.xpath('//td[@height="24"]/a').re(r'<a href="([^"]*)"')
    # if the page you want is a secondary page and you can only get their urls,
    # you can collect the urls and then use the Request() function
    urls = []
    for i in range(len(conference)):
        url = 'http://www.gsc.org.cn' + conference[i]  # conference[i].xpath('//a/@href').extract()[0]
        print(url)
        urls.append(url)
    for url in urls:
        # parameters can be passed by meta={'key': value}
        yield Request(url, meta={'url': url}, callback=self.parsecontent_temps)
def parse(self, response):
    # the rules how to deal with the pages you get
    # learn more about 'xpath' grammar
    conference = response.xpath(
        '//td[@style="text-align:left;line-height:28px;border-bottom:1px #DBDBDB dashed;"]/a'
    ).re(r'<a href="([^"]*)"')
    # if the page you want is a secondary page and you can only get their urls,
    # you can collect the urls and then use the Request() function
    urls = []
    for i in range(len(conference)):
        conference[i] = re.sub('amp;', '', conference[i])
        url = 'http://www.geosociety.org.cn/' + conference[i]  # conference[i].xpath('//a/@href').extract()[0]
        print(url)
        urls.append(url)
    for url in urls:
        # parameters can be passed by meta={'key': value}
        yield Request(url, meta={'url': url}, callback=self.parsecontent_temps)
def parse_subpage(self, response):
    # print("subpage")
    try:
        global video_url, url, duration, video_keywords, title, description, image, modified_time, category, lang, modified_video_keywords
        url = response.xpath(
            "//meta[@property = 'og:url']/@content").extract_first()
        langcheck = response.meta['langcheck']
        try:
            if langcheck == "tamil":
                url = response.xpath(
                    '//script[contains(.,"__html5playerdata")]').re(
                        '"media":"(.+)"')[0]
            else:
                url = response.xpath(
                    '//script[contains(., "__html5playerdata")]/text()').re(
                        '"media_mp4":"(.+)"')[0]
            video = url.split('","')[0]
            video_url = video.replace('\\', '')
            # print(video_url)
            url = response.xpath(
                "//meta[@property = 'og:url']/@content").extract_first()
            # print(url)
            title = response.xpath(
                "//meta[@property = 'og:title']/@content").extract_first()
            # print(title)
            description = response.xpath(
                "//meta[@property = 'og:description']/@content").extract_first()
            # print(description)
            image = response.xpath(
                "//meta[@property = 'og:image']/@content").extract_first()
            # print(image)
            video_keywords = response.xpath(
                "//meta[@name = 'keywords']/@content").extract_first()
            # print(video_keywords)
            duration = response.xpath(
                '//script[@type = "application/ld+json"]/text()').re(
                    '"duration": +"(.+)"')[0]
            # print(duration)
            duration = duration.replace('PT', '0:').replace('M', ':').replace('S', '')
            # print(duration)
            # yield {
            #     'food': response.xpath("//meta[@property = 'og:title']/@content").extract_first()
            # }
            category = response.meta['category']
            lang = response.meta['lang']
            if langcheck == 'tamil':
                lang = "tamil"
            else:
                try:
                    title.encode(encoding='utf-8').decode('ascii')
                except UnicodeDecodeError:
                    lang = "hindi"
                else:
                    lang = "english"
        except:
            print("EXCEPTION HIT")
        try:
            # zero-pad each part of the H:M:S duration string
            time = duration.split(':')
            time1 = time[0] if int(time[0]) > 9 else '0' + time[0]
            time2 = time[1] if int(time[1]) > 9 else '0' + time[1]
            time3 = time[2] if int(time[2]) > 9 else '0' + time[2]
            modified_time = str(time1 + ":" + time2 + ":" + time3)
        except:
            print("TIME ERROR")
        # print(video_url, duration, title, description, image, video_keywords)
        try:
            modified_video_keywords = [x.strip() for x in video_keywords.split()]
        except:
            print("SPLIT ERROR")
        keywords = Queries.insert_keywords(self, modified_video_keywords)
        insertObject = {
            "video_title": title,
            "video_slug": slugify(title),
            "video_link": video_url,
            "video_description": description,
            "broadcaster": "5d3e9dde3b6d5e43e2ef58ec",  # Not yet changed
            "videoformat": "5ce4f7eda5c038104cb76648",  # Not yet changed
            "video_image": image,
            "videokeywords": keywords,
            "page_url": response.url,
            "duration": modified_time,
            "category": category,
            "language": lang,
            "keywords": ' | '.join(map(str, modified_video_keywords)),
        }
        if ((len(video_url) > 0 and len(image) > 0 and len(modified_time) == 8)
                and (video_url.endswith('.m3u8') or video_url.endswith('.mp4'))):
            # print("aaa------------------aaaa")
            # print(title)
            # print(video_url)
            # print(modified_time)
            # print(image)
            # print(category)
            # print(lang)
            #
            # print(insertObject)
            # print("bbb------------------bbb")
            result = Queries.insert_api(self, insertObject)
            if result.status_code == 200:
                print("INSERTED")
            else:
                print(result.status_code)
                print(result)
    except:
        print("URL MISSING")
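# The duration handling above turns an ISO-8601 value such as "PT5M30S" into a
# zero-padded "00:05:30" string in two passes. A compact sketch of the same idea
# as a single helper (the function name is illustrative; unlike the original it
# also accepts an optional hours part):
def iso_duration_to_hms(value):
    """Convert e.g. 'PT5M30S' or 'PT1H2M3S' to an 'HH:MM:SS' string."""
    match = re.fullmatch(r'PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?', value)
    if not match:
        return None
    hours, minutes, seconds = (int(part or 0) for part in match.groups())
    return '%02d:%02d:%02d' % (hours, minutes, seconds)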
def parse(self, response):
    # walk every user's profile link on the current page
    cxy = response.xpath("//div[@class='user-avatar ui tiny image']//@href").extract()
    for i in cxy:
        # each fetched profile page is handed to the zh() method for processing
        yield scrapy.Request(url=i, callback=self.zh, dont_filter=True)
def zh(self, response):
    item = ForprogrammerItem()
    '''
    introduction = response.xpath("string(//div[@class='introduction'])").extract()[0]  # personal introduction
    item['introduction'] = introduction.replace('\n', '').replace(' ', '').replace('，', '-')
    workAndStudyExperience = response.xpath("//ul[@class='J_Works']//text()").extract()  # work + education experience
    if len(workAndStudyExperience) > 0:
        item['workAndStudyExperience'] = ','.join(workAndStudyExperience).replace('\n', '').replace(' ', '').replace('，', '-')
    else:
        item['workAndStudyExperience'] = '无'
    skill = response.xpath("//div[@class='skill-list']//text()").extract()  # skills
    if len(skill) > 0:
        item['skill'] = ','.join(skill).replace('\n', '').replace(' ', '').replace('，', '-')
    else:
        item['skill'] = '无'
    moneyAndWork = response.xpath("//div[@class='hire-info']//p//text()").extract()  # salary and job characteristics
    if len(moneyAndWork) > 0:
        item['moneyAndWork'] = ','.join(moneyAndWork).replace('\n', '').replace(' ', '').replace('，', '-')
    else:
        item['moneyAndWork'] = '无'
    zuopin = response.xpath(  # portfolio works
        "//div[@class='work-list']//ul//li//a[@class='media']//div[@class='info']//p//text()").extract()
    item['zuopin'] = ','.join(zuopin).replace(' ', '')
    '''
    # ------------------ the fields below were added later to match the database schema ------------------
    # city
    user_city = response.xpath("//div[@class='introduction']//text()").extract()
    if len(user_city) == 3:
        user_city = user_city[1]
    elif len(user_city) == 2:
        user_city = '-'
    # current company name
    user_nowcompany = response.xpath("//div[@class='introduction']//text()").extract()
    if len(user_nowcompany) == 2:
        user_nowcompany = user_nowcompany[1].split()[0]
    elif len(user_nowcompany) == 3:
        user_nowcompany = user_nowcompany[2].split()[0]
    # current job title
    user_nowoccupation = response.xpath("//div[@class='introduction']//text()").extract()
    if len(user_nowoccupation) == 2:
        user_nowoccupation = user_nowoccupation[1].split()[1]
    elif len(user_nowoccupation) == 3:
        user_nowoccupation = user_nowoccupation[2].split()[1]
    # personal introduction
    user_introduction = response.xpath("string(//div[@class='overflowhidden editor-style content'])").extract()[0]
    user_introduction = user_introduction.replace('\n', '').replace(' ', '').replace('，', '-')
    # expected daily rate
    user_expectsalary = response.xpath("string(//div[@class='hire-info']//p[@class='work-price'])").extract()[0]
    # graduation school
    user_school = response.xpath("//div[@class='panel proginn-work-history'][last()]//p[@class='title']//span[2]//text()").extract()[0]
    # number of comments
    user_comment_number = response.xpath("//div[@id='proginn_wo_omment']//div[@class='content']//div[@class='content']//a").extract()
    user_comment_number = len(user_comment_number)
    # source link of the crawled page
    source_link = response.xpath("//head//link[@rel='canonical']//@href").extract()[0]
    # daily working hours outside holidays
    # user_noworktime = response.xpath("//div[@class='hire-info']//p[last()]//text()").extract()[0]
    # item['user_noworktime'] = user_noworktime
    # github
    # github = response.xpath("//div[@class='social-list']//a/@href").extract()[0]
    # item['github'] = github
    # ability description
    user_ability_describe = response.xpath("//div[@class='verify']//text()").extract()
    user_ability_describe = ','.join(user_ability_describe).lstrip().replace('，', ' ').lstrip()
    # name
    user_name = response.xpath("//a[@class='header']//text()").extract()[0]
    # avatar
    user_picturehead = response.xpath("//div[@class='four wide column side-profile']//a//img//@src").extract()[0]
    # id taken from the user detail url
    details_id = response.xpath("//head//link[@rel='canonical']//@href").extract()[0]
    details_id = details_id[27:]
    item = ForprogrammerItem(
        user_city=user_city,
        user_nowcompany=user_nowcompany,
        user_nowoccupation=user_nowoccupation,
        user_introduction=user_introduction,
        user_expectsalary=user_expectsalary,
        user_school=user_school,
        user_comment_number=user_comment_number,
        source_link=source_link,
        user_ability_describe=user_ability_describe,
        user_name=user_name,
        user_picturehead=user_picturehead,
        details_id=details_id,
    )
    yield item
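# The zh() method above instantiates ForprogrammerItem with keyword arguments,
# so the item class needs a matching Field for each name. A minimal sketch of
# such a declaration (an assumed layout for the project's items module, not the
# original source):
class ForprogrammerItem(scrapy.Item):
    user_city = scrapy.Field()
    user_nowcompany = scrapy.Field()
    user_nowoccupation = scrapy.Field()
    user_introduction = scrapy.Field()
    user_expectsalary = scrapy.Field()
    user_school = scrapy.Field()
    user_comment_number = scrapy.Field()
    source_link = scrapy.Field()
    user_ability_describe = scrapy.Field()
    user_name = scrapy.Field()
    user_picturehead = scrapy.Field()
    details_id = scrapy.Field()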