Example #1
 def parse_items(self, response):
     hxs = HtmlXPathSelector(response)
     titles = hxs.select("//p")
     items = []
     for title in titles:
         item = CraigslistSampleItem()
         item["title"] = title.select("a/text()").extract()
         item["link"] = title.select("a/@href").extract()
         items.append(item)
     print items
     return items
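All of these examples assume a CraigslistSampleItem item class defined in the project's items.py. It is not shown on this page, so the following is only a minimal sketch covering the fields used in Example #1; later examples reference additional fields (date, price, carlink, abstractInformation, profile, and so on) that would be declared the same way.

from scrapy.item import Item, Field

class CraigslistSampleItem(Item):
    # minimal sketch; add one Field() per attribute the spider assigns
    title = Field()
    link = Field()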
Example #2
    def parse(self, response):

        hxs = HtmlXPathSelector(response)

        titles = hxs.xpath('//p[@class="row"]')

        items = []
        for title in titles:
            # structure of a Craigslist listing row:
            # <p class="row" data-pid="4947366935"> <a href="/cto/4947366935.html" class="i"
            # data-ids="0:00d0d_45lNsxzKpKi"><span class="price">&#x0024;2500</span></a> <span class="txt">
            # <span class="star"></span> <span class="pl"> <time datetime="2015-03-24 16:25" title="Tue 24 Mar
            # 04:25:35 PM (7 minutes ago)">Mar 24</time> <a href="/cto/4947366935.html" data-id="4947366935"
            # data-repost-of="3855891772" class="hdrlnk">1995 Toyota Camry</a> </span> <span class="l2">
            # <span class="price">&#x0024;2500</span> <span class="pnr"> <small> (Chapel Hill)</small>
            # <span class="px"> <span class="p"> pic</span></span> </span> </span> </span> </p>

            # timePost, price, carlink, abstractInformation, mileagel, model, caryear
            item = CraigslistSampleItem()
            L = title.xpath(".//a/@href").extract()
            if len(L) > 0:
                item["carlink"] = ("http://raleigh.craigslist.org/" +
                                   L.pop().encode('ascii', 'ignore'))
            else:
                item["carlink"] = "unknown"

            item["abstractInformation"] = title.xpath(
                ".//a[@class='hdrlnk']/text()").extract().pop().encode(
                    'ascii', 'ignore')
            item["timePost"] = title.xpath(
                ".//span[@class='pl']/time/@datetime").extract().pop().encode(
                    'ascii', 'ignore')

            L = title.xpath(".//span[@class='price'][1]/text()").extract()
            if len(L) > 0:
                item["price"] = (L.pop()).encode('ascii', 'ignore')
            else:
                item["price"] = '0'
            item["price"] = int(item["price"].replace('$', ''))

            if (self.minPrice <= item["price"] <= self.maxPrice
                    and self.model in item["abstractInformation"].lower()):
                try:
                    milageandy = getMilageAndYear(item["carlink"])
                    item["mileagel"] = milageandy['milage']
                    item["caryear"] = milageandy['year']
                except Exception:
                    item["mileagel"] = '0'
                    item["caryear"] = '0'

                if item["timePost"] < self.ctime:
                    raise CloseSpider('bandwidth_exceeded')

                if self.starttime <= item["caryear"] <= self.endtime:
                    items.append(item)

        return items
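Example #2 also depends on several spider attributes (minPrice, maxPrice, model, ctime, starttime, endtime), on CloseSpider, and on an external getMilageAndYear() helper that this page does not show. As a rough sketch only: the attributes could be supplied through Scrapy's -a command-line arguments in the spider's __init__. The class name CarSpider and the default values below are assumptions, and getMilageAndYear() stays project-specific.

import scrapy
from scrapy.exceptions import CloseSpider  # raised in Example #2 to stop the crawl early

class CarSpider(scrapy.Spider):  # hypothetical wrapper; the real spider class is not shown
    name = 'cars'

    def __init__(self, minPrice='0', maxPrice='99999', model='camry',
                 ctime='', starttime='0', endtime='9999', *args, **kwargs):
        # `scrapy crawl cars -a minPrice=1000 -a model=camry` ends up here
        super(CarSpider, self).__init__(*args, **kwargs)
        self.minPrice = int(minPrice)  # compared against the integer price
        self.maxPrice = int(maxPrice)
        self.model = model.lower()     # substring-matched against abstractInformation
        self.ctime = ctime             # compared against the post's datetime string
        self.starttime = starttime     # compared against the scraped year strings
        self.endtime = endtime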
Example #3
 def parse(self, response):
     titles = response.selector.xpath("//p")
     items = []
     for title in titles:
         item = CraigslistSampleItem()
         item["title"] = title.xpath("a/text()").extract()
         item["link"] = title.xpath("a/@href").extract()
         items.append(item)
     return items
Example #4
 def parse_items(self, response):
     hxs = HtmlXPathSelector(response)
     titles = hxs.xpath('//span[@class="pl"]')
     items = []
     for title in titles:
         item = CraigslistSampleItem()
         item["title"] = title.xpath("a/text()").extract()
         item["link"] = title.xpath("a/@href").extract()
         items.append(item)
     return items
Example #5
 def parse(self, response):
     hxs = HtmlXPathSelector(response)
     titles = hxs.select("//p")
     items = []
     for title in titles:
         item = CraigslistSampleItem()
         item["title"] = title.select("a/text()").extract()
         item["link"] = title.select("a/@href").extract()
         items.append(item)
     return items
Example #6
    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.select('//span[@class="pl"]')
        items = []
        for title in titles:
            item = CraigslistSampleItem()
            item['title'] = title.select("a/text()").extract()
            item['link'] = title.select("a/@href").extract()
            items.append(item)
        return items
Example #7
 def parse(self, response):
     hxs = Selector(response)
     titles = hxs.xpath("//span[@class='pl']")
     items = []
     for title in titles:
         item = CraigslistSampleItem()
         item["title"] = title.xpath("a/text()").extract()
         item["link"] = title.xpath("a/@href").extract()
         items.append(item)
     return items
Example #8
 def parse(self, response):
     hxs = HtmlXPathSelector(response)
     titles = hxs.xpath("//span[@class='pl']")
     items = []
     for title in titles:
         item = CraigslistSampleItem()
         item["title"] = title.select(
             'a/span[@id="titletextonly"]/text()').extract()
         item["link"] = title.select("a/@href").extract()
         items.append(item)
     return items
Example #9
    def parse(self, response):
        items = []
        hxs = scrapy.selector.HtmlXPathSelector(response)
        titles = hxs.select("//p[@class='result-info']")

        for t in titles:
            item = CraigslistSampleItem()
            item["title"] = t.select("a/text()").extract()
            item["link"] = t.select("a/@href").extract()
            items.append(item)
        return items
Example #10
 def parse_items(self, response):
     hxs = HtmlXPathSelector(response)
     titles = hxs.xpath('//a[@class="title"]')
     print titles
     items = []
     for title in titles:
         item = CraigslistSampleItem()
         item["title"] = title.xpath('text()').extract()
         # item["link"] = title.xpath("@href").extract()
         items.append(item)
     return items
Example #11
    def parse(self, response):
        things = Selector(response).xpath(
            "/html/body/section/section/main/section")

        for thing in things:
            item = CraigslistSampleItem()
            item['title'] = thing.xpath(
                './header/h1/a/text()').extract()[0]
            item['post'] = thing.xpath(
                './div/p/text()').extract()[0]

            yield item
Example #12
    def parse(self, response):
        hxs = Selector(response)
        titles = hxs.xpath("//div[@class='neighborbox-table']/table/tbody")
        items = []
        for title in titles:

            item = CraigslistSampleItem()
            item["title"] = title.xpath("/tr/th/text()").extract()
            item["link"] = title.xpath("/tr/td/text()").extract()
            print "test--------------------"
            # print "********title is:********" + item["title"],"\n------link is: ------"+item["link"]
            items.append(item)
            return items
Example #13
 def parse(self, response):
   hxs = HtmlXPathSelector(response)
   titles = hxs.select("//span[@class='pl']")
   date_info = hxs.select("//h4[@class='ban']/span[@class='bantext']/text()")
   items = []
   file_to = open("things.txt", "a")
   file_to.write(response.body)
   file_to.close()
   for title in titles:
       item = CraigslistSampleItem()
       item["title"] = title.select("a/text()").extract()
       item["link"] = title.select("a/@href").extract()
       item["date"] = date_info.extract()
       items.append(item)
   return items
Example #14
    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        rows = hxs.select('//div[@class="content"]/p[@class="row"]')

        items = []
        for row in rows:
            item = CraigslistSampleItem()
            link = row.xpath('.//span[@class="pl"]/a')
            item['date'] = row.xpath(
                './/span[@class="pl"]/span[@class="date"]/text()').extract()
            item['title'] = link.xpath("text()").extract()
            item['link'] = link.xpath("@href").extract()
            item['price'] = row.xpath(
                './/span[@class="l2"]/span[@class="price"]/text()').extract()
            items.append(item)
        return items
Example #15
    def parsee(self, response):
        profiles = response.xpath(
            '//li[@class="listing PROFESSOR"]/a/@href').extract()
        print "the list of it ", len(profiles)
        # for each of those links
        for profile in profiles:
            # define item
            professor = CraigslistSampleItem()

            # add profile to professor
            professor["profile"] = profile

            # pass each profile page to the parser_profile() callback
            request = scrapy.Request("http://www.ratemyprofessors.com" +
                                     profile,
                                     callback=self.parser_profile)
            request.meta["professor"] = professor

            # add professor to array of professors
            yield request
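The parser_profile() callback that Example #15 registers is not included on this page. A minimal sketch of the request.meta hand-off it relies on could look like the following; everything beyond reading the item back out of response.meta is an assumption.

    def parser_profile(self, response):
        # retrieve the partially filled item that the parse callback attached to the request
        professor = response.meta["professor"]
        # ...extract whatever profile fields the project needs here...
        yield professor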
Example #16
    def parse(self, response):
        soup = BeautifulSoup(response.body, 'html.parser')
        # print(soup.prettify())
        entries = soup.select('span.pl')
        items = []
        for entry in entries:
            item = CraigslistSampleItem()
            item["title"] = entry.find('span', attrs={
                'id': 'titletextonly'
            }).get_text().encode("utf-8")
            item["link"] = 'http://sfbay.craigslist.org' + entry.find('a').get(
                'href').encode("utf-8")
            items.append(item)
        print items
        return items


# var str = '\xe2\x9d\x80Bee Part of the Solution\xe2\x9d\x80Work W/Environment CA\xe2\x9d\x80 $10-15/hr';
#
# function decode_utf8(s) {
#   return decodeURIComponent(escape(s));
# }
#
# decode_utf8(str);