Example #1
 def parseCupMatch(self, response):
     datas = Selector(response).xpath('//table[@class="lrace_bei"]//td/a').extract()
     for data in datas:
         try:
             lsName = Selector(text=data).xpath('//a/text()').extract()[0]
             if lsName == response.meta['lsName']:
                 url = Selector(text=data).xpath('//a/@href').extract()[0]
                 url = 'http://liansai.500.com{0}'.format(url)
                 html = requests.get(url)
                 ssName = Selector(text=html.text).xpath('//div[@class="ldrop_bd"]//li[@class="ldrop_list_on"]/a/text()').extract()[0]
                 url = Selector(text=html.text).xpath('//div[@class="lcol_tit_r"]/a/@href').extract()[0]
                 url = 'http://liansai.500.com{0}'.format(url)
                 # get the first-level category page URL
                 html = requests.get(url)
                 classes = Selector(text=html.text).xpath('//div[@id="match_stage"]/a[@data-id]').extract()
                 for cs in classes:
                     url = Selector(text=cs).xpath('//a/@href').extract()[0]
                     # stage id: the segment after 'jifen-', minus the trailing character
                     istid = url.find('jifen-')
                     stid = url[istid + 6:][:-1]
                     url = 'http://liansai.500.com{0}'.format(url)
                     mClass1 = Selector(text=cs).xpath('//a/text()').extract()[0]
                     yield Request(url=url, callback=self.parseBsData, meta={'lsName': response.meta['lsName'], 'ssName': ssName, 'stid': stid, 'mClass1': mClass1})
                 return
         except Exception as e:
             print('parseCupMatch fail: {0}'.format(e))
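Note that the two requests.get calls above run synchronously and block Scrapy's reactor. A hedged sketch of the more idiomatic alternative, splitting the hop into a second scheduled request (the callback name parse_league_page and the container class are illustrative, not from the source):

    from scrapy import Request

    class CupSpiderSketch:  # illustrative container, not the original spider
        def parseCupMatch(self, response):
            for link in response.xpath('//table[@class="lrace_bei"]//td/a'):
                if link.xpath('text()').extract_first() == response.meta['lsName']:
                    url = 'http://liansai.500.com{0}'.format(
                        link.xpath('@href').extract_first())
                    # hand off to another callback instead of calling requests.get
                    yield Request(url, callback=self.parse_league_page,
                                  meta={'lsName': response.meta['lsName']})
                    return

        def parse_league_page(self, response):
            # the Selector(text=html.text) lookups from the original would operate
            # on the scheduled response here
            ssName = response.xpath(
                '//div[@class="ldrop_bd"]//li[@class="ldrop_list_on"]/a/text()').extract_first()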
Example #2
 def parse(self, response):
     rows = response.xpath('//*[@id="table-buildings"]/tbody/tr').extract()
     for row in rows:
         blgName = Selector(
             text=row).xpath('//td[4]/a/text()').extract()[0].strip()
         blgCity = Selector(text=row).xpath('//td[5]/a/text()').extract()[0]
         blgCountry = Selector(
             text=row).xpath('//td[5]/a/text()').extract()[1]
         blgFloor = Selector(text=row).xpath('//td[8]/text()').extract()[0]
         blgPurpose = Selector(
             text=row).xpath('//td[11]/text()').extract()[0].strip()
         blgUrl = "https://skyscrapercenter.com/" + Selector(
             text=row).xpath('//td[4]/a/@href').extract()[0].strip()
         hgtRank = Selector(text=row).xpath('//td[1]/text()').extract()[0]
         hgtFeet = Selector(
             text=row).xpath('//td[7]/text()').extract()[0].replace(
                 ",", "")
         isMultiPurpose = "Y" if blgPurpose.find("/") != -1 else "N"
         forOffice = "Y" if blgPurpose.find("office") != -1 else "N"
         forResidential = "Y" if blgPurpose.find(
             "residential") != -1 else "N"
         forHotel = "Y" if blgPurpose.find("hotel") != -1 else "N"
         forRetail = "Y" if blgPurpose.find("retail") != -1 else "N"
         yrComplete = Selector(
             text=row).xpath('//td[9]/text()').extract()[0]
         item = SkyscraperItem()
         item['blgName'] = blgName
         item['blgCity'] = blgCity
         item['blgCountry'] = blgCountry
         item['blgFloor'] = blgFloor
         item['blgPurpose'] = blgPurpose
         item['blgUrl'] = blgUrl
         item['hgtRank'] = hgtRank
         item['hgtFeet'] = hgtFeet
         item['isMultiPurpose'] = isMultiPurpose
         item['forOffice'] = forOffice
         item['forResidential'] = forResidential
         item['forHotel'] = forHotel
         item['forRetail'] = forRetail
         item['yrComplete'] = yrComplete
         request = scrapy.Request(blgUrl, callback=self.parse_building_page)
         # pass the partially filled item to parse_building_page via request.meta
         request.meta['item'] = item
         yield request
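The parse method above fills a SkyscraperItem whose definition is not part of the excerpt; based on the fields it assigns, the item class would look roughly like this (a sketch under that assumption, with parse_building_page presumably adding further fields):

    import scrapy

    class SkyscraperItem(scrapy.Item):
        # fields populated in parse() above
        blgName = scrapy.Field()
        blgCity = scrapy.Field()
        blgCountry = scrapy.Field()
        blgFloor = scrapy.Field()
        blgPurpose = scrapy.Field()
        blgUrl = scrapy.Field()
        hgtRank = scrapy.Field()
        hgtFeet = scrapy.Field()
        isMultiPurpose = scrapy.Field()
        forOffice = scrapy.Field()
        forResidential = scrapy.Field()
        forHotel = scrapy.Field()
        forRetail = scrapy.Field()
        yrComplete = scrapy.Field()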
Example #3
 def parse(self, response):
     print(response.url)
     links_data = Selector(response).xpath('//script[contains(., "var data = ")]/text()').extract()[0]
     l, r = links_data.find('{'), links_data.rfind('}')
     if l == -1 or r == -1:
         # extraction failed; re-request the page (dont_filter so the dupe filter keeps it)
         yield Request(response.url, dont_filter=True)
     else:
         article_links = [item['url'] for item in json.loads(links_data[l:r + 1])['result']]
         with open('root/csdn.%s.content' % self.tag, 'a') as fp:
             fp.write('\n'.join(article_links + ['']))
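The var data = ... blob itself is never shown, but the slicing plus json.loads(...)['result'] implies a JSON object with a result list of entries carrying a url key; a tiny self-contained illustration (the URL is made up):

    import json

    links_data = 'var data = {"result": [{"url": "https://blog.csdn.net/example/1"}]};'
    l, r = links_data.find('{'), links_data.rfind('}')
    article_links = [entry['url'] for entry in json.loads(links_data[l:r + 1])['result']]
    print(article_links)  # ['https://blog.csdn.net/example/1']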
Example #4
    def parse(self, response):
        """Parse a Tieba board listing page.

        Yields items from self._parse_posts(response) until should_stop()
        signals a stop, then follows the "next page" link.
        """
        for item in self._parse_posts(response):
            if not self.should_stop(item):
                yield item
            else:
                return

        if len(Selector(response).css('#frs_list_pager .next')):
            # some Tieba pagination hrefs are relative rather than full URLs
            next_page_url = Selector(response).css('#frs_list_pager .next::attr(href)').extract_first()
            logging.debug('next_page_url %s', next_page_url)
            if -1 != next_page_url.find('http://tieba.baidu.com'):
                yield Request(next_page_url, callback=self.parse)
            else:
                yield Request('http://tieba.baidu.com' + next_page_url, callback=self.parse)
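The manual check for 'http://tieba.baidu.com' can be folded into response.urljoin, which resolves relative hrefs against the page URL; a hedged drop-in for the pagination tail of parse (same selector, same callback):

    next_page_url = response.css('#frs_list_pager .next::attr(href)').extract_first()
    if next_page_url:
        yield Request(response.urljoin(next_page_url), callback=self.parse)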
Example #5
    def parse(self, response):
        if (response.url.find('www.baidu.com') != -1):
            #print(response.request.headers.get('User-Agent', None), 1111111111111111)
            #print(response.url)
            for target_a in response.xpath(
                    '//div[@id="rs"]/table/tr/th/a').extract():
                keyword = Selector(
                    text=target_a).xpath('//a/text()').extract_first()
                href = Selector(
                    text=target_a).xpath('//a/@href').extract_first()
                if (keyword.find('干洗') != -1 or keyword.find('洗衣') != -1):
                    if (keyword.find('洗衣粉') == -1 and keyword.find('洗衣液') == -1
                            and keyword.find('洗衣机') == -1):
                        item = BaidukeyItem()  # fresh item for each keyword
                        item['keywords'] = keyword
                        item['types'] = '百度pc'
                        item['status'] = 0
                        yield item
                        fullhref = response.urljoin(href)
                        yield scrapy.Request(url=fullhref, callback=self.parse)
        elif (response.url.find('m.baidu.com') != -1):
            #print(response.request.headers.get('User-Agent', None))
            for target_a in response.xpath(
                    '//div[@id="relativewords"]/div[@class="rw-list"]/a'
            ).extract():
                keyword = Selector(
                    text=target_a).xpath('//a/text()').extract_first()
                href = Selector(
                    text=target_a).xpath('//a/@href').extract_first()
                if (keyword.find('干洗') != -1 or keyword.find('洗衣') != -1):
                    if (keyword.find('洗衣粉') == -1 and keyword.find('洗衣液') == -1
                            and keyword.find('洗衣机') == -1):
                        item = BaidukeyItem()  # fresh item for each keyword
                        item['keywords'] = keyword
                        item['types'] = '百度移动'
                        item['status'] = 0
                        yield item
                        fullhref = response.urljoin(href)
                        yield scrapy.Request(url=fullhref, callback=self.parse)
        elif (response.url.find("www.so.com") != -1):
            for target_a in response.xpath(
                    '//div[@id="rs"]/table/tr/th/a').extract():
                keyword = Selector(
                    text=target_a).xpath('//a/text()').extract_first()
                href = Selector(
                    text=target_a).xpath('//a/@href').extract_first()
                if (keyword.find('干洗') != -1 or keyword.find('洗衣') != -1):
                    if (keyword.find('洗衣粉') == -1 and keyword.find('洗衣液') == -1
                            and keyword.find('洗衣机') == -1):
                        item = BaidukeyItem()  # fresh item for each keyword
                        item['keywords'] = keyword
                        item['types'] = '好搜'
                        item['status'] = 0
                        yield item
                        fullhref = response.urljoin(href)
                        yield scrapy.Request(url=fullhref, callback=self.parse)

        elif (response.url.find("sogou.com") != -1):
            for target_a in response.xpath(
                    '//table[@id="hint_container"]/tr/td/p/a').extract():
                keyword = Selector(
                    text=target_a).xpath('//a/text()').extract_first()
                href = Selector(
                    text=target_a).xpath('//a/@href').extract_first()
                if (keyword.find('干洗') != -1 or keyword.find('洗衣') != -1):
                    if (keyword.find('洗衣粉') == -1 and keyword.find('洗衣液') == -1
                            and keyword.find('洗衣机') == -1):
                        item = BaidukeyItem()  # fresh item for each keyword
                        item['keywords'] = keyword
                        item['types'] = '搜狗'
                        item['status'] = 0
                        yield item
                        fullhref = response.urljoin(href)
                        yield scrapy.Request(url=fullhref, callback=self.parse)
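The four search-engine branches above repeat the same keyword filter; a hedged sketch of factoring it into a helper (the method name _is_laundry_keyword is made up, not from the source):

    def _is_laundry_keyword(self, keyword):
        # keep dry-cleaning / laundry keywords, drop detergent and washing-machine ones
        wanted = ('干洗', '洗衣')
        excluded = ('洗衣粉', '洗衣液', '洗衣机')
        return (any(w in keyword for w in wanted)
                and not any(e in keyword for e in excluded))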
Example #6
 def parse_school(self, response):
     jiaoyubao_url = re.match(r'(http://\w+\.jiaoyubao\.cn)/.*',
                              response.url).group(1)
     item = response.meta['item']
     s = Selector(response)
     course_urls = s.xpath(
         '//div[@class="ZcTabSerP"]/div/a/@href').extract()
     pool = redis.ConnectionPool(host='127.0.0.1', port=6379, db=1)
     r = redis.Redis(connection_pool=pool)
     pipe = r.pipeline(transaction=True)
     for course in course_urls:
         courses_url = urljoin(jiaoyubao_url, course)
         pipe.lpush("jyb_course_urls", courses_url)
     pipe.execute()  # save the course links from this school page to Redis
     name = s.xpath(
         '//div[1]/div[1]/div/div[1]/div[4]/text()').extract_first()
     features = s.xpath('//div[@class="item2"]').extract_first()
     feature = ''
     if features:
         feature = remove_tags(features)
     msg = s.xpath('//div[@class="content_description"]')
     intros2 = s.xpath('//div[@class="ZcTabC"]').extract()
     intros1 = s.xpath(
         '//div[@class="ComTab"]/div[@class="ComTab_Item"][2]/div[@class="ComTab_Item_body"]'
     ).extract()
     intro = ''
     if msg:
         #src = msg.xpath('img/@href').extract_first()
         intros = msg.xpath('p').extract()
         for text in intros:
             if text != ' ':
                 intro += remove_tags(text.strip())
     elif intros1:
         for text in intros1:
             if text != ' ':
                 intro += remove_tags(text.strip())
     elif intros2:
         for text in intros2:
             if text != ' ':
                 intro += remove_tags(text.strip())
     else:
         intros3 = s.xpath('//a[@name="机构简介"]/../div').extract()
         for text3 in intros3:
             if text3 != ' ':
                 intro += remove_tags(text3.strip())
     srcs = s.xpath(
         '//div[@class="j j_Slide loading"]/div/ol/li/img/@src').extract()
     if not srcs:
         srcs = s.xpath(
             '//div[@class="j j_Slide loading"]/ol/li/img/@src').extract()
         if not srcs:
             srcs = s.xpath('//li[@class="J_ECPM"]/img/@src').extract()
     pic = []
     if srcs:
         i = 0
         for src in srcs:
             src3 = src
             year = time.strftime('%Y', time.localtime(time.time()))
             month = time.strftime('%Y.%m', time.localtime(time.time()))
             day = time.strftime('%Y.%m.%d', time.localtime(time.time()))
             t = int(time.time())
             num = random.randint(1000, 9999)
             filename = str(t) + str(num) + '.png'
             filepath2 = self.path2 + year + '/' + month + '/' + day + '/' + filename
             fileYear = self.path1 + year
             fileMonth = fileYear + '/' + month
             fileDay = fileMonth + '/' + day
             filepath1 = fileDay + '/' + filename
             # create the year/month/day directory tree as needed
             os.makedirs(fileDay, exist_ok=True)
             try:
                 urlretrieve(src3, filepath1)
             except Exception:
                 # retry with the URL percent-encoded
                 urlretrieve(urllib.parse.quote(src, safe=string.printable), filepath1)
             pic.append(filepath2)
             i += 1
             if i == 5:
                 break
     js = s.xpath('//div[@class="wangdian"]/span/@onclick').extract_first()
     if not js:
         js = s.xpath(
             '//div[@class="ZcPoint"]/div[@class="pa"]/div[@class="pa02"]/@onclick'
         ).extract_first()
         if not js:
             js = s.xpath(
                 '//div[@class="tl3_dd2"]/div[@class="tl3_dr2"]/span/@onclick'
             ).extract_first()
     datas = ''
     maps = []
     address = []
     tel = ''
     city = ''
     if js:
         datas = js[27:-1].split(',')
     if datas:
         try:
             arg = int(datas[3])
         except:
             arg = int(datas[3][1:-1])
         cityid = int(datas[9][4:-1])
         # traced in the browser: the location data is loaded dynamically from
         # http://api.jiaoyubao.cn/map/Ajax.aspx with these POST parameters
         data = {
             "os": 1,
             "arg": arg,
             "city": cityid,
             "page": 1,
             "pagesize": 10,
             "key": ''
         }
         map_url = "http://api.jiaoyubao.cn/map/Ajax.aspx"
         if data:
             resp = requests.post(map_url, data=data)
             soup = BeautifulSoup(resp.text, "html.parser")
             point1 = soup.find_all('point')
             city = soup.points['cityname']
             p = soup.find('point')
             if not name:
                 name = p.get('cp_name')
             tel1 = p.get('u400')
             tel2 = p.get('tel400')
             if tel2:
                 tel = tel1 + '转' + tel2
             else:
                 tel = tel1
             for point in point1:
                 map = []
                 addr = ''
                 campus = point.get('name')
                 campus_address = point.get('address')
                 lng = point.get('lng')
                 lat = point.get('lat')
                 addr = point.get('name') + '/' + point.get('address')
                 value = lng, lat
                 map.append(campus)
                 map.append(campus_address)
                 map.append(value)
                 maps.append(map)
                 address.append(addr)
     else:
         citys = s.xpath(
             '//div[@class="Item_ComTop1"]/div[1]/div[1]/a[2]/text()'
         ).extract_first()
         if citys:
             city = citys[:(len(citys) - 3)]
         tels = s.xpath('//div[@class="ZcTel"]/div[1]')
         tel1 = tels.xpath('span[2]/text()').extract_first()
         tel2 = tels.xpath('text()').extract_first()
         if tel2:
             tel = tel1 + '转' + tel2
         else:
             tel = tel1
     item['city'] = city
     item['tel'] = tel
     item['address'] = address
     item['average_price'] = None
     item['location'] = maps
     item['teacher_num'] = None
     item['pic'] = pic
     item['intro'] = ''.join(intro.split())
     item['feature'] = feature
     item['tags'] = None
     item['name'] = name
     yield item
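This callback leans on quite a few modules beyond Scrapy; a hedged guess at the imports the surrounding spider module would need (none are shown in the excerpt; remove_tags is presumably w3lib.html's and urlretrieve urllib.request's):

    import os
    import random
    import re
    import string
    import time
    import urllib.parse
    from urllib.parse import urljoin
    from urllib.request import urlretrieve

    import redis
    import requests
    from bs4 import BeautifulSoup
    from scrapy.selector import Selector
    from w3lib.html import remove_tags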