Ejemplo n.º 1
0
 def parse(self, response):
     """Request the index page of the first three navigation categories.

     Each ``<a>`` under the hard-coded navigation path contributes one
     request for ``<href>index.html``, handled by ``self.parse_url``.

     Args:
         response: Response whose body contains the navigation list.

     Yields:
         scrapy.Request: One request per category index page.
     """
     anchors = response.xpath("/html/body/div[2]/div[2]/ul/li/a")
     for i, item in enumerate(anchors, start=1):
         movUrl = item.xpath("@href").extract_first()
         # An anchor without an href cannot be followed; concatenating None
         # below would raise TypeError.
         if movUrl is not None:
             # range(1, 2) only produces page 1; widen it to also request
             # the index_<n>.html pages.
             for j in range(1, 2):
                 if j == 1:
                     mvUrl2 = movUrl + 'index.html'
                 else:
                     mvUrl2 = movUrl + 'index_%s.html' % j
                 try:
                     # Pass the bound method itself. Calling it here would
                     # run the callback immediately, without a response.
                     # NOTE(review): if parse_url also needs the category
                     # label, pass it through Request.meta - confirm.
                     request = scrapy.Request(url=mvUrl2,
                                              callback=self.parse_url)
                 except Exception as error:
                     print("-------------")
                     print(error)
                 else:
                     yield request
         # Only the first three categories are processed.
         if i > 2:
             break
Ejemplo n.º 2
0
 def parse_item(self, response):
     """Build an item from the country-name and population table cells.

     Each field holds the list of every text node matched by its CSS
     selector.

     Returns:
         ExampleItem: Item with ``name`` and ``population`` set.
     """
     selectors = (
         ('name', 'tr#places_country__row td.w2p_fw::text'),
         ('population', 'tr#places_population__row td.w2p_fw::text'),
     )
     item = ExampleItem()
     for field, css in selectors:
         item[field] = response.css(css).extract()
     return item
Ejemplo n.º 3
0
    def parse_directory(self, response):
        """Scrape one job-detail page into an ``ExampleItem`` and yield it.

        Reads title, location, experience, degree and salary text from fixed
        XPath locations, converts the salary text into numeric
        ``minmoney``/``maxmoney`` values, and stamps the item with today's
        date and an id derived from the page URL via ``self.md5``.

        Every ``extract()[0]`` below raises ``IndexError`` when its XPath
        matches nothing.

        Yields:
            ExampleItem: The populated item.
        """
        item = ExampleItem()
        url = response.url
        jid = self.md5(url)

        title = response.xpath(r'//h1/text()').extract()[0]
        location = response.xpath(
            r'//div/span[@class="lname"]/text()').extract()[0]
        # str.strip() removes any of the characters 年/经/验 from both ends;
        # it does not match the literal suffix.
        exp = response.xpath(
            r'//div[@class="t1"]/span[1]/text()').extract()[0].strip('年经验')
        if '无工作' in exp:
            exp = 0
        else:
            # Keeps only the first character of what is left (for "3-4" that
            # is "3").
            # NOTE(review): a two-digit value such as "10" becomes "1" -
            # confirm this is intended.
            exp = exp.strip('-')[0]
        degree = response.xpath(
            r'//div[@class="t1"]/span[2]/text()').extract()[0]
        crawled = datetime.datetime.now().strftime('%Y-%m-%d')
        money = response.xpath(r'//div[@class="cn"]/strong/text()').extract()
        maxmoney = minmoney = 0
        # Unwrap the first salary text. When nothing matched, money stays an
        # empty list, none of the substring tests below succeed, and both
        # bounds remain 0.
        if money:
            money = money[0]
        else:
            pass
        if '/月' in money:
            # Monthly range "<low>-<high><unit>/月": the unit found on the
            # upper bound selects the multiplier applied to both bounds.
            money = money.strip('/月')
            money = money.split('-')
            if 'k' in money[1]:
                maxmoney = float(money[1].strip('k')) * 1000
                minmoney = float(money[0]) * 1000
            elif '千' in money[1]:
                maxmoney = float(money[1].strip('千')) * 1000
                minmoney = float(money[0]) * 1000
            elif '万' in money[1]:
                maxmoney = float(money[1].strip('万')) * 10000
                minmoney = float(money[0]) * 10000
        elif '万/年' in money:
            # NOTE(review): the yearly branch multiplies 万 by 1000 while the
            # monthly branch uses 10000 - presumably a rough per-month
            # conversion; confirm.
            money = money.strip('/年')
            money = money.split('-')
            if '万' in money[1]:
                maxmoney = float(money[1].strip('万')) * 1000
                minmoney = float(money[0]) * 1000
        elif '以上' in money:
            # Left as a string here; the int() calls at item assignment
            # convert it.
            minmoney = maxmoney = money.strip('元/月以上')
        elif '面议' in money:
            minmoney = maxmoney = 0
        elif '以下' in money:
            minmoney = maxmoney = int(money.strip('元/月以下'))

        item['title'] = title
        # int() truncates the float bounds computed above.
        item['maxmoney'] = int(maxmoney)
        item['minmoney'] = int(minmoney)
        item['crawled'] = crawled
        item['location'] = location
        item['exp'] = exp
        item['degree'] = degree
        item['url'] = url
        item['jid'] = jid

        print(title, location, exp, degree, url)
        yield item
Ejemplo n.º 4
0
 def parse(self, response):
     """Yield one item per paragraph that carries both a year and a title.

     Args:
         response: Response whose ``<p>`` elements are scanned.

     Yields:
         ExampleItem: Item with ``year`` (first ``span`` text) and ``title``
         (first ``a/b`` text) of a paragraph.
     """
     hxs = HtmlXPathSelector(response)
     for paragraph in hxs.select("//p"):
         years = paragraph.select("span/text()").extract()
         names = paragraph.select("a/b/text()").extract()
         # "//p" matches every paragraph on the page; one lacking either node
         # is not an entry and would otherwise raise IndexError.
         if not years or not names:
             continue
         item = ExampleItem()
         item['year'] = years[0]
         item['title'] = names[0]
         yield item
Ejemplo n.º 5
0
 def parse(self, response):
     print "s%d" % self.count,
     self.count += 1
     news_item = ExampleItem()
     items = response.xpath("//div[@id='list']/a")
     for item in items:
         news_item["news_link"] = item.xpath("@href").extract()[0]
         news_item["news_title"] = item.xpath("span/text()").extract()[0]
         news_item["news_time"] = item.xpath("font/text()").extract()[0]
         yield news_item
Ejemplo n.º 6
0
 def parse_item(self, response):
     """Collect the country-name and population cell texts into an item.

     Returns:
         ExampleItem: ``name`` and ``population`` each hold the list of text
         nodes matched by the corresponding CSS selector.
     """
     def texts(css):
         # All text nodes matched by one CSS selector.
         return response.css(css).extract()

     item = ExampleItem()
     item['name'] = texts('tr#places_country__row td.w2p_fw::text')
     item['population'] = texts('tr#places_population__row td.w2p_fw::text')
     return item
Ejemplo n.º 7
0
    def parse_book(self, response):
        """Extract name, price, availability and review count of one book.

        Args:
            response: Response for a book detail page.

        Yields:
            ExampleItem: Item with ``name``, ``price``, ``availability`` (the
            first run of digits in row 6) and ``review_num``; a field is
            ``None`` when its node is missing.
        """
        book = ExampleItem()
        table = response.xpath('//article/table')
        book['name'] = response.xpath('//article//h1/text()').extract_first()
        # './/' keeps each lookup inside the selected table. A leading '//'
        # restarts from the document root and ignores the table context.
        book['price'] = table.xpath('.//tr[3]/td/text()').extract_first()
        book['availability'] = table.xpath('.//tr[6]/td/text()').re_first(
            r'(\d+)')
        book['review_num'] = table.xpath('.//tr[7]/td/text()').extract_first()

        yield book
Ejemplo n.º 8
0
 def parse(self, response):
     """Yield one item per listing ``<div>`` that wraps a list-item ``<dl>``.

     Yields:
         ExampleItem: Item whose ``url`` is the text of the first titled
         ``<a>`` directly under the div, or ``None`` when there is none.
     """
     boxes = response.xpath('//div[dl[@class="f-list-item-wrap f-clear"]]')
     for box in boxes:
         # A fresh item per box so already-yielded items are not overwritten.
         item = ExampleItem()
         # Course path of this div. Store the extracted string rather than
         # the SelectorList, which is not plain, serializable data.
         item['url'] = box.xpath('./a[@title]/text()').extract_first()
         yield item
Ejemplo n.º 9
0
 def parseJobDetail(self, response):
     """Scrape a job-detail page into a single item.

     Target data: job name, work location, job category, responsibilities
     and requirements. The first three are single strings; the last two are
     lists of ``<li>`` texts.

     Yields:
         ExampleItem: The populated item.
     """
     print(response.status)

     def first_text(query):
         # First match of the query; IndexError when nothing matches.
         return response.xpath(query).extract()[0]

     def all_texts(query):
         return response.xpath(query).extract()

     item = ExampleItem()
     item['jobName'] = first_text('//td[@id="sharetitle"]/text()')
     item['workLocation'] = first_text(
         '//tr[@class="c bottomline"]/td[1]/text()')
     item['jobType'] = first_text('//tr[@class="c bottomline"]/td[2]/text()')
     item['jobDesc'] = all_texts(
         '//table[@class="tablelist textl"]/tr[3]//li/text()')
     item['jobInfo'] = all_texts(
         '//table[@class="tablelist textl"]/tr[4]//li/text()')
     yield item
Ejemplo n.º 10
0
 def parse(self, response):
     print 'c%d' % self.count,
     self.count += 1
     news_item = ExampleItem()
     items = response.xpath("//ul[@class='news-list']/li")
     for item in items:
         news_item["news_link"] = item.xpath("a/@href").extract()[0]
         news_item["news_title"] = item.xpath("a/text()").extract()[0]
         news_item["news_time"] = item.xpath(
             "span[@class='time']/text()").extract()[0]
         yield news_item
Ejemplo n.º 11
0
    def parse_directory(self, response):
        """Yield the title and URL of every post thumbnail on the page.

        Yields:
            ExampleItem: Item with ``name`` and ``url`` read from the
            thumbnail link of one post; a field is ``None`` when the
            attribute is missing.
        """
        print(response.status)

        for node in response.xpath('//div[@class="post floated-thumb"]'):
            item = ExampleItem()
            # './/' searches inside the current post. A leading '//' restarts
            # from the document root, so every item would receive the first
            # post's values.
            item['name'] = node.xpath(
                './/div[@class="post-thumb"]/a/@title').extract_first()
            item['url'] = node.xpath(
                './/div[@class="post-thumb"]/a/@href').extract_first()

            yield item
    def parse_blog(self, response):
        """Collect title, description, date and body text of a blog post.

        Returns:
            ExampleItem: Each field holds the list of text nodes matched
            inside the ``span9`` container.
        """
        field_queries = (
            ('title', "//div[@class='span9']/h1/a/text()"),
            ('description', "//div[@class='span9']/p[@class='lead']/text()"),
            ('date', "//div[@class='span9']/h1/small/text()"),
            ('post', "//div[@class='span9']/p/text()"),
        )
        hxs = HtmlXPathSelector(response)
        item = ExampleItem()
        for field, query in field_queries:
            item[field] = hxs.select(query).extract()
        return item
Ejemplo n.º 13
0
    def parse_directory(self, response):
        """Scrape one job-detail page into an ``ExampleItem`` and yield it.

        The salary text is turned into integer ``minmoney``/``maxmoney``
        bounds: a "low-high元" range gives both bounds, "…元以上"/"…元以下"
        set both bounds to the single figure, and "面议" (or any unrecognised
        text) leaves them at 0.

        Every ``extract()[0]`` raises ``IndexError`` when its XPath matches
        nothing.

        Yields:
            ExampleItem: The populated item.
        """
        item = ExampleItem()
        url = response.url
        jid = self.md5(url)
        title = response.xpath('//h1/text()').extract()[0]
        money = response.xpath(
            '//ul[@class="clearfix pos-relat"]/li[2]/em/text()').extract()[0]
        minmoney = maxmoney = 0
        if '-' in money:
            money = money.split('-')
            minmoney = int(money[0])
            maxmoney = int(money[1].strip('元'))
        elif '面议' in money:
            # Negotiable salary: keep both bounds at 0.
            pass
        elif '元以上' in money:
            # Converted to int so this branch yields the same type as the
            # range and "以下" branches instead of a raw string.
            minmoney = maxmoney = int(money.strip('元以上'))
        elif '元以下' in money:
            minmoney = maxmoney = int(money.strip('元以下'))
        degree = response.xpath(
            '//ul[@class="clearfix pos-relat"]/li[3]/em/text()').extract()[0]
        # The location sits in li[8] or li[7]; the union takes whichever
        # comes first in document order.
        location = response.xpath(
            '//ul[@class="clearfix pos-relat"]/li[8]/em/text()|//ul[@class="clearfix pos-relat"]/li[7]/em/text()'
        ).extract()[0].strip('                ')
        # strip() removes the listed characters from both ends, leaving the
        # date text that self.Strfdate receives.
        crawled = response.xpath('//p[@class="data-sty mb-5"]/span[1]/text()'
                                 ).extract()[0].strip('更新时间:')
        crawled = self.Strfdate(crawled)
        exp = response.xpath(
            '//ul[@class="clearfix pos-relat"]/li[4]/em/text()').extract()[0]
        # First run of digits in the experience text, or 0 when there is none.
        exp_match = re.search(r'(\d+)', exp)
        exp = exp_match.group(1) if exp_match else 0

        item["title"] = title
        item["maxmoney"] = maxmoney
        item["minmoney"] = minmoney
        item["location"] = location
        item["crawled"] = crawled
        item["exp"] = exp
        item["degree"] = degree
        item["url"] = url
        item["jid"] = jid

        yield item
Ejemplo n.º 14
0
    def parse(self, response):
        """Yield every first-level navigation link, then follow the third.

        Yields:
            ExampleItem: One item per ``<li>`` with a constant ``title`` and
            the list of hrefs found in its ``<a>``.
            scrapy.Request: A request for the third link's first href,
            handled by ``self.next``, when such a link exists.
        """
        sel = Selector(response)
        sites = sel.xpath('/html/body/div[4]/div/div[2]/div[1]/ul/li')
        items = []
        for site in sites:
            # A fresh item per link. Appending one shared instance made every
            # list entry alias the same object, so index 2 actually held the
            # last link.
            item = ExampleItem()
            item['title'] = '一级链接'
            item['url'] = site.xpath('a/@href').extract()
            items.append(item)
            yield item
        # Follow the third link only when it exists and has an href.
        if len(items) > 2 and items[2]['url']:
            yield scrapy.Request(items[2]['url'][0], callback=self.next)
Ejemplo n.º 15
0
    def parse_directory(self, response):
        """Scrape one job-detail page into an ``ExampleItem`` and yield it.

        The salary text is turned into integer ``minmoney``/``maxmoney``
        bounds: a "low-high" range gives both bounds, "…以上"/"…以下" set both
        bounds to the single figure, and "面议" (or any unrecognised text)
        leaves them at 0.

        Every ``extract()[0]`` raises ``IndexError`` when its XPath matches
        nothing.

        Yields:
            ExampleItem: The populated item.
        """
        item = ExampleItem()
        url = response.url
        jid = self.md5(url)

        location = response.xpath(
            '//div[@class="pos-area"]/span/span[1]/text()').extract()[0]
        exp = response.xpath(
            '//div[@class="pos_base_condition"]/span[3]/text()').extract()[0]
        # First run of digits in the experience text, or 0 when there is none.
        exp_match = re.search(r'(\d+)', exp)
        exp = exp_match.group(1) if exp_match else 0
        date_pub = response.xpath(
            '//span[@class="pos_base_num pos_base_update"]/span/text()'
        ).extract()[0]
        crawled = self.Strfdate(date_pub)

        degree = response.xpath(
            '//span[@class="item_condition"]/text()').extract()[0]
        title = response.xpath(
            '//div[@class="pos_base_info"]/span[1]/text()').extract()[0]
        # strip() removes the unit characters and the non-breaking space
        # from both ends of the salary text.
        money = response.xpath('//div[@class="pos_base_info"]/span[2]/text()'
                               ).extract()[0].strip('元/月\xa0')
        minmoney = maxmoney = 0
        if '-' in money:
            money = money.split('-')
            minmoney = int(money[0])
            maxmoney = int(money[1])
        elif '面议' in money:
            # Negotiable salary: keep both bounds at 0.
            pass
        elif '以上' in money:
            # Converted to int so this branch yields the same type as the
            # range and "以下" branches instead of a raw string.
            minmoney = maxmoney = int(money.strip('元/月以上'))
        elif '以下' in money:
            minmoney = maxmoney = int(money.strip('元/月以下'))

        item['url'] = url
        item['jid'] = jid
        item['title'] = title
        item['location'] = location
        item['exp'] = exp
        item['degree'] = degree
        item['maxmoney'] = maxmoney
        item['minmoney'] = minmoney
        item['crawled'] = crawled
        yield item
    def parse(self, response):
        """Store the response on disk when applicable and yield its record.

        Non-HTML responses are saved under ``self.debug_nohtml_cache_dir``.
        HTML responses are saved under ``self.debug_dir`` when
        ``self.htmlParseRulesSelector`` accepts them, get their ``<title>``
        recorded as ``name``, and are passed to
        ``self.get_urls_store_redis``.

        Yields:
            ExampleItem: Item with ``url`` and, for HTML responses, ``name``.
        """
        record = ExampleItem()
        is_html = self.isNoHtmlFile(response) is not True
        if is_html:
            # Keep the page only when the parse rules accept it.
            rules_matched = self.htmlParseRulesSelector(response)
            if rules_matched is True:
                self.saveItToFile(self.debug_dir, response)
            record['name'] = response.css('title::text').get()
            self.get_urls_store_redis(response)
        else:
            self.saveItToFile(self.debug_nohtml_cache_dir, response)

        record['url'] = response.url
        yield record
Ejemplo n.º 17
0
    def parse(self, response):
        """Yield the page's book names and prices, then request the catalogue.

        Yields:
            ExampleItem: One item whose ``name`` and ``price`` are lists
            covering every product on the page.
            Request: One request per catalogue page, handled by this same
            callback.
        """
        name_query = '//article[@class="product_pod"]/h3/a/text()'
        price_query = (
            '//div[@class="product_price"]/p[@class="price_color"]/text()')

        item = ExampleItem()
        item['name'] = response.xpath(name_query).extract()
        print(item['name'])
        item['price'] = response.xpath(price_query).extract()
        print(item['price'])
        # Yield the item once the data is extracted.
        yield item

        # NOTE(review): range(1, 50) stops at page 49 - confirm whether
        # page-50 should be requested as well.
        prefix = 'http://books.toscrape.com/catalogue/page-'
        suffix = '.html'
        for page_number in range(1, 50):
            yield Request(prefix + str(page_number) + suffix,
                          callback=self.parse)
Ejemplo n.º 18
0
    def parse_directory(self, response):
        """Scrape one job-detail page into an ``ExampleItem`` and yield it.

        The four ``job_request`` spans supply salary, location, experience
        and degree. A "<low>k-<high>k" salary is converted to integer bounds;
        any other salary text leaves both bounds at 0.

        Yields:
            ExampleItem: The populated item.
        """
        def first_text(query):
            # First match of the query; IndexError when nothing matches.
            return response.xpath(query).extract()[0]

        item = ExampleItem()
        url = response.url
        jid = self.md5(url)

        location = first_text(
            '//dd[@class="job_request"]/p/span[2]/text()'
        ).strip('/').strip(' /')

        exp_text = first_text(
            '//dd[@class="job_request"]/p/span[3]/text()'
        ).strip(' /').strip('经验')
        # With a year figure present keep its first character, otherwise 0.
        exp = exp_text.strip('年').strip('-')[0] if '年' in exp_text else 0

        degree = first_text(
            '//dd[@class="job_request"]/p/span[4]/text()').strip(' /')
        crawled = datetime.datetime.now().strftime('%Y-%m-%d')
        title = first_text('//span[@class="name"]/text()')

        salary_parts = first_text(
            '//dd[@class="job_request"]/p/span[1]/text()').split('-')
        maxmoney = minmoney = 0
        if 'k' in salary_parts[0]:
            minmoney = int(salary_parts[0].strip('k')) * 1000
            maxmoney = int(salary_parts[1].strip('k ')) * 1000

        # Populate the item.
        fields = (
            ('url', url),
            ('jid', jid),
            ('title', title),
            ('location', location),
            ('exp', exp),
            ('degree', degree),
            ('maxmoney', maxmoney),
            ('minmoney', minmoney),
            ('crawled', crawled),
        )
        for key, value in fields:
            item[key] = value
        yield item
Ejemplo n.º 19
0
    def parse_directory(self, response):
        """Yield name, link and description for every entry of the list.

        Yields:
            ExampleItem: One item per ``<li>`` of the ``boxbdnopd`` list.
            ``IndexError`` is raised when an entry lacks one of the nodes.
        """
        field_queries = (
            ("name", './/h4/a/text()'),
            ("link", './/h4/a/@href'),
            ("description", './/p[@class="description"]/text()'),
        )

        for node in response.xpath('//ul[@class="boxbdnopd"]/li'):
            item = ExampleItem()
            # Extract everything first, then fill the item.
            values = [(field, node.xpath(query).extract()[0])
                      for field, query in field_queries]
            for field, value in values:
                item[field] = value

            print("link==", item["link"])
            yield item
Ejemplo n.º 20
0
    def parse(self, response):
        """Yield every post on the page, then follow the pagination links.

        Yields:
            ExampleItem: Item with ``name`` and ``detailurl`` read from the
            thumbnail link of one post.
            Request: One request per ``page-numbers`` link, handled by this
            same callback.
        """
        title_query = './/div[@class="post-thumb"]/a/@title'
        href_query = './/div[@class="post-thumb"]/a/@href'

        for post in response.xpath('//div[@class="post floated-thumb"]'):
            item = ExampleItem()
            item['name'] = post.xpath(title_query).extract_first()
            item['detailurl'] = post.xpath(href_query).extract_first()
            yield item

        # Collect the URLs of the other pages linked from this page.
        page_links = response.xpath(
            '//a[@class="page-numbers"]/@href').extract()
        print(page_links)
        for page_url in page_links:
            yield Request(url=page_url, callback=self.parse)
Ejemplo n.º 21
0
    def parse(self, response):
        """Yield every listing on the page and follow the "next" link.

        Overwrites ``data.json`` with the raw body of the page being parsed.

        Yields:
            ExampleItem: One item per listing with ``title``, ``price``,
            ``area`` and ``rent``. ``IndexError`` is raised when a listing
            lacks one of these nodes.
            scrapy.Request: A request for the next results page, when the
            second ``pageBox`` contains a "next" link.
        """
        with open('data.json', 'wb') as f:
            f.write(response.body)

        for box in response.xpath(
                '//div[@class="f-list-item ershoufang-list"]'):
            # A fresh item per listing so already-yielded items are not
            # overwritten by later iterations.
            item = ExampleItem()
            # Listing title.
            item['title'] = box.xpath(
                './/dd[@class="dd-item title"]/a[@class="js-title value title-font"]/text()'
            ).extract()[0]
            # Price.
            item['price'] = box.xpath(
                './/dd[@class="dd-item info"]/div[@class="price"]/span[@class="num"]/text()'
            ).extract()[0]
            # Floor area.
            item['area'] = box.xpath(
                './/dl[@class="f-list-item-wrap f-clear"]/dd[@class="dd-item size"]/span[5]/text()'
            ).extract()[0]
            # Rental type.
            item['rent'] = box.xpath(
                './/dl[@class="f-list-item-wrap f-clear"]/dd[@class="dd-item size"]/span[@class="first js-huxing"]/text()'
            ).extract()[0]

            yield item

        # Look up the href before building the URL: concatenating the host
        # with None (no "next" link, e.g. on the last page) raises TypeError,
        # and a page with fewer than two pageBox nodes raises IndexError.
        page_boxes = response.css('div.pageBox')
        next_href = None
        if len(page_boxes) > 1:
            next_href = page_boxes[1].xpath(
                './/a[@class="next"]/@href').extract_first()
        if next_href is not None:
            next_page = response.urljoin('http://bj.ganji.com' + next_href)
            yield scrapy.Request(next_page,
                                 callback=self.parse,
                                 headers={'referer': next_page})
Ejemplo n.º 22
0
    def parse_directory(self, response):
        """Scrape one job-detail page into an ``ExampleItem`` and yield it.

        Salary bounds are kept as text, except for "面议" which gives 0 for
        both. "…以上"/"…以下" set both bounds to the single figure; anything
        else is split on "-" into lower and upper bound.

        Yields:
            ExampleItem: The populated item.
        """
        def first_text(query):
            # First match of the query; IndexError when nothing matches.
            return response.xpath(query).extract()[0]

        item = ExampleItem()
        url = response.url.strip()
        jid = self.md5(url)

        location = first_text(r'//ul/li[2]/strong/a/text()')
        exp_text = first_text(r'//ul/li[5]/strong/text()').strip('年')
        # "不限" means no requirement; otherwise keep the first character.
        exp = 0 if '不限' in exp_text else exp_text.strip('-')[0]
        degree = first_text(r'//ul/li[6]/strong/text()')
        crawled = datetime.datetime.now().strftime('%Y-%m-%d')
        title = first_text(r'//div[@class="inner-left fl"]/h1/text()').strip()

        money = first_text(r'//ul/li[1]/strong/text()').strip('元/月\xa0')
        if '以上' in money:
            minmoney = maxmoney = money.strip('元/月以上')
        elif '面议' in money:
            minmoney = maxmoney = 0
        elif '以下' in money:
            minmoney = maxmoney = money.strip('元/月以下')
        else:
            bounds = money.split('-')
            maxmoney = bounds[1].strip('元/月')
            minmoney = bounds[0]

        fields = (
            ('jid', jid),
            ('title', title),
            ('maxmoney', maxmoney),
            ('minmoney', minmoney),
            ('crawled', crawled),
            ('location', location),
            ('exp', exp),
            ('degree', degree),
            ('url', url),
        )
        for key, value in fields:
            item[key] = value

        yield item
Ejemplo n.º 23
0
    def parse_page(self, response):
        """Store a monthly sales table in MySQL and yield what was read.

        When the page title matches the "<year>年<month>月份<country>…销量"
        pattern, each (manufacturer, sales) row of the table is inserted
        into the ``sale`` table and yielded. The page URL and title are
        always yielded last.

        Yields:
            dict: ``{'logo', 'num'}`` per table row, preceded by
            ``{'insert error': url}`` when that row's insert failed; then
            ``{'year', 'month', 'country'}``; finally ``{'url', 'name'}``.
        """
        item = ExampleItem()
        item['url'] = response.url
        item['title'] = response.xpath(
            '/html/head/title/text()').extract_first()
        # extract_first() gives None for a page without <title>; match
        # against an empty string instead of raising TypeError.
        m = re.match(r'(\d+)年(\d+)月份(\S+)\S车\(分制造商\)销量',
                     item['title'] or '')
        if m is not None:
            year, month, country = m.groups()
            # NOTE(review): connection settings and credentials are
            # hard-coded here; consider moving them to the spider settings.
            conn = pymysql.Connect(host='127.0.0.1',
                                   port=3306,
                                   user='******',
                                   passwd='123456',
                                   db='car',
                                   charset='utf8')
            try:
                logo_cells = response.xpath(
                    '//div[@class="newstext"]/table/tbody/tr/td[1]/font')
                num_cells = response.xpath(
                    '//div[@class="newstext"]/table/tbody/tr/td[2]/font')
                # Placeholders let the driver escape the scraped text;
                # formatting it into the SQL string allowed injection and
                # broke on values containing quotes.
                insert_sql = "insert into sale values(%s,%s,%s,%s,%s)"
                # zip pairs the two columns by position without the
                # per-row index() lookup, which was quadratic and picked the
                # wrong row for duplicate cells.
                for logo_cell, num_cell in zip(logo_cells, num_cells):
                    logo = logo_cell.xpath('string(.)').extract()[0]
                    num = num_cell.xpath('string(.)').extract()[0]
                    cursor = conn.cursor()
                    try:
                        cursor.execute(insert_sql,
                                       (year, month, country, logo, num))
                    except pymysql.Error:
                        yield {'insert error': item['url']}
                    finally:
                        cursor.close()
                    yield {'logo': logo, 'num': num}
                yield {'year': year, 'month': month, 'country': country}
                conn.commit()
            finally:
                # Close the connection even when parsing or the consumer
                # raises part-way through.
                conn.close()
        # No trailing "return item": a generator's return value is not part
        # of the yielded output, and Python 2 rejects it as a syntax error.
        yield {'url': item['url'], 'name': item['title']}
Ejemplo n.º 24
0
    def parse_mor(self, response, mvsclass, img, name, mvUrl):
        """Yield one download item per ``contentinfo`` block of a movie page.

        Args:
            response: Response for the movie detail page.
            mvsclass: Stored as the item's ``movClass``.
            img: Stored as the item's ``downimgurl``.
            name: Stored as the item's ``downLoadName``.
            mvUrl: Stored as the item's ``mvUrl``.

        Yields:
            ExampleItem: Item with the download links joined by ";", the
            link labels, and the concatenated description paragraphs.
        """
        for select in response.xpath('//div[@class="contentinfo"]'):
            # Download addresses; there may be several.
            download_urls = select.xpath(
                "div/table/tbody/.//tr/td/a/@href").extract()
            # Text of the download links.
            link_titles = select.xpath(
                "div/table/tbody/.//tr/td/a/text()").extract()
            paragraphs = select.xpath("div[@id='text']/.//p/text()").extract()
            desc = ''.join(text.strip() for text in paragraphs)

            Item = ExampleItem()
            Item['movClass'] = mvsclass
            Item['downLoadName'] = name
            # link_titles is a list, so its str() is never empty ("[]" at the
            # least). Test the list itself so the default label applies when
            # no link text was found.
            if link_titles:
                Item['downdtitle'] = str(link_titles)
            else:
                Item['downdtitle'] = "点击下载"
            Item['downimgurl'] = img
            Item['downLoadUrl'] = ";".join(download_urls)
            Item['mvdesc'] = desc
            Item['mvUrl'] = mvUrl
            yield Item
Ejemplo n.º 25
0
    def parse(self, response):
        """Turn a JSON fare response into one item per single-segment journey.

        A body that cannot be decoded is printed together with the status
        and the error, and nothing is yielded. Otherwise ``self.is_ok`` is
        set and every journey with exactly one segment (or none) produces an
        item built from its sell key and its available fares. The first
        available fare supplies price, cabin and seat count; the remaining
        ones are serialised into ``segments`` as ``[price, seats]`` pairs. An
        ``availableCount`` of 32767 is reported as 9 seats.

        Yields:
            ExampleItem: One item per accepted journey.
        """
        try:
            data_dict = json.loads(response.body)
        except Exception as e:
            print(response.body)
            print(response.status)
            print(e)
            return
        self.is_ok = True
        fares = data_dict.get('fares')
        journeys = jsonpath(data_dict, '$..journeys')[0]

        def to_epoch(stamp):
            # Local-time epoch seconds for a "MM/DD/YYYY HH:MM" string.
            return time.mktime(time.strptime(stamp, '%m/%d/%Y %H:%M'))

        for journey in journeys:
            segments = journey.get('segments')
            if len(segments) > 1:
                continue

            # The sell key packs six fields separated by runs of "~", "|"
            # and whitespace; empty pieces are dropped.
            sell_key = journey.get('journeySellKey')
            key_parts = [part for part in re.split(r'~[~|\s]*', sell_key)
                         if part]
            (carrier, fn_no, dep_port, dep_date,
             arr_port, arr_date) = key_parts
            flight_number = carrier + fn_no
            dep_time = to_epoch(dep_date)
            arr_time = to_epoch(arr_date)
            available_fares = jsonpath(segments, '$..availableFares')[0]
            dep_city = self.port_city.get(dep_port, dep_port)
            arr_city = self.port_city.get(arr_port, arr_port)

            # Get the lowest price that has seats and the package prices.
            lowest_index = None
            other_packages = []
            seats = 0
            for position, offer in enumerate(available_fares):
                offer_index = offer.get('fareIndex')
                if lowest_index is None:
                    lowest_index = offer_index
                    count = offer.get('availableCount')
                    seats = 9 if count == 32767 else count
                if position:
                    amount = jsonpath(fares[offer_index], '$..amount')[0]
                    count = offer.get('availableCount')
                    other_packages.append(
                        [amount, 9 if count == 32767 else count])

            fare = fares[lowest_index]
            cabin = fare.get('classOfService')
            price = jsonpath(fare, '$..amount')[0]
            currency = jsonpath(fare, '$..currencyCode')[0]

            item = ExampleItem()
            item.update({
                'flight_number': flight_number,
                'dep_time': dep_time,
                'arr_time': arr_time,
                'dep_port': dep_port,
                'arr_port': arr_port,
                'currency': currency,
                'adult_price': price,
                'adult_tax': 0,
                'net_fare': price,
                'max_seats': seats,
                'cabin': cabin,
                'carrier': carrier,
                'is_change': 1,
                'segments': json.dumps(other_packages),
                'get_time': time.time(),
                'from_city': dep_city,
                'to_city': arr_city,
                'info': '',
            })
            yield item