コード例 #1
0
  def parse(self, response):
    """Parse a Washington Post search-results page into NewsItem objects.

    Walks the first 10 entries of the #search-results list and, for links
    whose URL contains /politics/, extracts headline, image, url and date.
    Optional fields whose node is missing are simply left unset.
    """
    hxs = HtmlXPathSelector(response)
    items = []

    for i in range(1, 11):
      # shared XPath prefix for the i-th result entry
      base = '//*[@id="search-results"]/li[' + str(i) + ']'
      url = hxs.select(base + '/h3/a/@href').extract()[0]

      if "/politics/" in url:
        item = NewsItem()
        try:
          # the site embeds a fixed run of CR/LF/tabs inside the headline text
          item['headline'] = hxs.select(base + '/h3/a/text()').extract()[0].replace("\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", "")
        except IndexError:
          pass  # no headline node for this entry

        try:
          # swap the 145x100 thumbnail for the 606w rendition
          item['imgsrc'] = hxs.select(base + '/a/img/@src').extract()[0].replace("145x100", "606w")
          item['imgSize'] = "606x406"
        except IndexError:
          pass  # entry has no image
        # plain assignment cannot raise IndexError; no try needed
        item['url'] = url
        try:
          item['date'] = hxs.select(base + '/cite/span/text()').extract()[0]
        except IndexError:
          pass  # no date node
        item['src'] = u'The Washington Post'

        items.append(item)
    return items
コード例 #2
0
ファイル: mem.py プロジェクト: apenugon/MediaBiasScraping
    def parse(self, response):
        """Build one NewsItem per headline link found on the page."""
        selector = HtmlXPathSelector(response)
        headlines = selector.select(
            "/html/body/table/tbody/tr/td[3]/div/a/text()").extract()

        results = []
        for headline_text in headlines:
            entry = NewsItem()
            entry['headline'] = headline_text
            results.append(entry)
        return results
コード例 #3
0
    def parse1(self, response):
        """Parse a Times of India article page into a single NewsItem.

        The site has two templates: newer pages carry #mod-article-header,
        older ones use the #netspidersosh layout.  Optional fields whose
        node is missing are left unset.

        Fixes vs. original:
        - the byline-date selector was a broken Python expression
          ('...span[3]' / text()) that raised TypeError; it is now one
          XPath string ending in /text().
        - the item is now returned for BOTH templates (the original only
          returned from the else branch, yielding None for newer pages).
        """
        hxs = HtmlXPathSelector(response)
        item = NewsItem()
        if len(
                hxs.select(
                    '//*[@id="mod-article-header"]/h1/text()').extract()) > 0:
            # newer template
            try:
                item['headline'] = hxs.select(
                    '//*[@id="mod-article-header"]/h1/text()').extract()[0]
            except IndexError:
                pass
            item['url'] = response.url
            try:
                curdate = hxs.select(
                    '//*[@id="mod-article-byline"]/span[3]/text()'
                ).extract()[0]
                # strip any residual HTML tags from the byline text
                item['date'] = re.sub('<[^<]+?>', '', curdate)
            except IndexError:
                pass
            item['src'] = u'Times of India'
        else:
            # older template
            try:
                item['headline'] = hxs.select(
                    '//*[@id="netspidersosh"]/div[1]/div/div[11]/div[1]/span[1]/h1/text()'
                ).extract()[0]
            except IndexError:
                pass
            item['url'] = response.url
            try:
                item['imgsrc'] = hxs.select(
                    '//*[@id="bellyad"]/div/div[1]/img/@src').extract()[0]
                item['imgSize'] = "300x209"
            except IndexError:
                pass
            try:
                # presumably a date-bearing <meta> tag — TODO confirm which
                item['date'] = hxs.select(
                    '/html/head/meta[1]/@content').extract()[0]
            except IndexError:
                pass
            item['src'] = u'Times of India'
        return item
コード例 #4
0
ファイル: abc.py プロジェクト: apenugon/MediaBiasScraping
 def parse(self, response):
   """Parse an ABC search-results page into a list of NewsItems.

   One item per 'result WireStory' block; the image is optional (the
   "_mw" suffix is stripped from its URL, presumably to request the
   full-size asset — confirm against the site).
   """
   hxs = HtmlXPathSelector(response)
   items = []
   articles = hxs.select("//div[@class='result WireStory']")
   for article in articles:
     item = NewsItem()
     item['headline'] = article.select("./a[@class='title']/text()").extract()[0]
     item['url'] = article.select("./a[@class='title']/@href").extract()[0]
     item['date'] = article.select(".//span[@class='date']/text()").extract()[0]
     item['src'] = u'abc'
     try:
       item['imgsrc'] = article.select(".//img/@src").extract()[0].replace("_mw", "")
       item['imgSize'] = "512x329"
     except IndexError:
       pass  # story has no image
     items.append(item)
   return items
コード例 #5
0
 def parse1(self, response):
   """Parse a New Republic article page into a single NewsItem.

   Tries the newer legacy-image layout first; falls back to a head
   <meta> content attribute for older articles (presumably the share
   image — confirm), and leaves the image fields unset when neither
   is present.
   """
   hxs = HtmlXPathSelector(response)
   item = NewsItem()
   item['headline'] = hxs.select('//*[@id="fixed-header-progress-bar-container"]/span[1]/strong/text()').extract()[0]
   item['url'] = response.url
   item['date'] = hxs.select('//*[@id="title-progress-bar-container"]/span[2]/text()').extract()[0]
   item['src'] = u"New Republic"
   try:  # if it's a newer article
     item['imgsrc'] = u'www.newrepublic.com' + hxs.select('//*[@class="legacy-image pull-right"]/img/@src').extract()[0]
     item['imgSize'] = u"242x242"
   except IndexError:
     try:  # maybe it's an old article?
       item['imgsrc'] = hxs.select('/html/head/meta[12]/@content').extract()[0]
       item['imgSize'] = u"1250x517"
     except IndexError:
       pass  # no image at all
   return item
コード例 #6
0
ファイル: sfc.py プロジェクト: apenugon/MediaBiasScraping
    def parse1(self, response):
        """Parse a single article page into a NewsItem.

        Headline comes from the #ogtitle content attribute and the date
        from the article header block; the lead image is optional.

        NOTE(review): the file is sfc.py but 'src' is set to "Slate" —
        confirm which outlet this spider actually targets.
        """
        hxs = HtmlXPathSelector(response)
        item = NewsItem()
        item['headline'] = hxs.select(
            '//*[@id="ogtitle"]/@content').extract()[0]
        item['url'] = response.url
        item['date'] = hxs.select(
            '//*[@id="article_header"]/div[1]/div[4]/text()').extract()[0]
        try:
            item['imgsrc'] = hxs.select(
                '/html/body/div[2]/article/section/div[2]/div[1]/figure/img/@src'
            ).extract()[0]
            item['imgSize'] = '590x421'
        except IndexError:
            pass  # article has no lead image
        item['src'] = "Slate"

        return item
コード例 #7
0
ファイル: wsj.py プロジェクト: apenugon/MediaBiasScraping
  def parse(self, response):
    """Parse a Wall Street Journal listing page into NewsItems.

    One item per <li> in the results list; the image is optional.
    (A dangling, unterminated triple-quoted leftover that followed the
    return statement — and broke the file's syntax — has been removed.)
    """
    hxs = HtmlXPathSelector(response)
    items = []
    myStuff = hxs.select('/html/body/div[1]/div[2]/div/div[1]/div[3]/ul/li')

    for entry in myStuff:
      item = NewsItem()
      item['url'] = entry.select(".//a/@href").extract()[0]
      item['headline'] = entry.select(".//h2/a/text()").extract()[0]
      item['date'] = entry.select("./div/ul/li/text()").extract()[0]

      try:
        item['imgsrc'] = entry.select("./div/div[@class='newsImage']/a/img/@src").extract()[0]
        item['imgSize'] = "640x360"
      except IndexError:
        pass  # listing entry has no image
      item['src'] = "Wall Street Journal"
      items.append(item)

    return items
コード例 #8
0
 def parse(self, response):
     """Parse a Time listing page into up to 15 NewsItems.

     Indexes the page-wide "tout" node lists by position i; any field
     whose extracted list is shorter than 15 entries is left unset for
     the remaining items.
     """
     hxs = HtmlXPathSelector(response)
     items = []
     # XPath template; '&*' is replaced by the field-specific suffix.
     # Loop-invariant, so built once (the original rebuilt it every pass).
     selector = '//div[@class="tout"]&*'
     for i in range(15):
         item = NewsItem()
         try:
             item['headline'] = hxs.select(
                 "//div[@class='tout']/h3/a/text()").extract()[i]
         except IndexError:
             pass
         try:
             # drop the thumbnail width parameter to get the full image
             item['imgsrc'] = hxs.select(
                 selector.replace(
                     '&*',
                     '/div[@class="img"]/a/img/@src')).extract()[i].replace(
                         "?w=360", "")
             item['imgSize'] = "360x240"
         except IndexError:
             pass
         try:
             item['url'] = hxs.select(selector.replace(
                 '&*', '//h3/a/@href')).extract()[i]
         except IndexError:
             pass
         try:
             item['date'] = hxs.select(
                 selector.replace(
                     '&*', '//span[@class = "date"]/text()')).extract()[i]
         except IndexError:
             pass
         item['src'] = u'Time'
         items.append(item)
     return items
コード例 #9
0
 def parse(self, response):
     """Parse a CBS listing page into up to 30 NewsItems.

     Each result lives in an <li> addressed by its @section index:
     '##' in the template is the 1-based index and '&*' the
     field-specific XPath suffix.  Missing fields are left unset.
     """
     hxs = HtmlXPathSelector(response)
     items = []
     for i in range(30):
         item = NewsItem()
         selector = '//li[@section=##]&*'
         selector = selector.replace('##', str(i + 1))
         try:
             item['headline'] = hxs.select(
                 selector.replace(
                     '&*', '//a[@class="title"]/text()')).extract()[0]
         except IndexError:
             pass
         try:
             # swap the 75x56 thumbnail suffix for the 640x480 rendition
             item['imgsrc'] = hxs.select(
                 selector.replace(
                     '&*', '//img[contains(@class, "storyImg")]/@src')
             ).extract()[0].replace("_75x56",
                                    "_640x480").replace("s.jpg", ".jpg")
             item['imgSize'] = "640x480"
         except IndexError:
             pass
         try:
             item['url'] = hxs.select(
                 selector.replace(
                     '&*', '//a[@class = "title"]/@href')).extract()[0]
         except IndexError:
             pass
         try:
             item['date'] = hxs.select(
                 selector.replace(
                     '&*', '//span[@class = "date"]/text()')).extract()[0]
         except IndexError:
             pass
         item['src'] = u'CBS'
         items.append(item)
     return items
コード例 #10
0
ファイル: nbc.py プロジェクト: apenugon/MediaBiasScraping
    def parse(self, response):
        """Parse the NBC News listing into NewsItems.

        One item per <li>; headline/url/date come from the entry's
        article header, and the first image in the entry is optional.
        """
        hxs = HtmlXPathSelector(response)
        items = []
        myStuff = hxs.select('//*[@id="vine-t"]/div[2]/div/div[2]/ul/li')

        for entry in myStuff:
            item = NewsItem()
            try:
                item['headline'] = entry.select(
                    './article/header/h2/a/text()').extract()[0]
            except IndexError:
                pass
            try:
                item['imgsrc'] = entry.select('.//img/@src').extract()[0]
                item['imgSize'] = "600x400"
            except IndexError:
                pass  # entry has no image
            try:
                item['url'] = entry.select(
                    './article/header/h2/a/@href').extract()[0]
            except IndexError:
                pass
            try:
                item['date'] = entry.select(
                    './article/header/time/@datetime').extract()[0]
            except IndexError:
                pass
            item['src'] = u'NBC News'

            items.append(item)
        return items
コード例 #11
0
    def parse(self, response):
        """Parse the Breitbart blog-roll into NewsItems.

        Articles come in two markup variants: entries wrapped in a
        div.clearfix (with a thumbnail) and plain entries.  The shared
        tail (date, src, append) is factored out; in the thumbnail
        variant the date is selected on the re-scoped clearfix node,
        exactly as before.
        """
        hxs = HtmlXPathSelector(response)
        items = []
        articleBlocks = hxs.select(
            "//section[@class='list list-blog-roll']/article[@class='story']")

        for block in articleBlocks:
            item = NewsItem()
            if len(block.select(".//div[@class='clearfix']").extract()) > 0:
                # thumbnail variant: re-scope all selections to the wrapper
                block = block.select(".//div[@class='clearfix']")
                try:
                    item['headline'] = block.select(
                        ".//div[@class='grid_9 omega']/h1/a/text()").extract(
                        )[0]
                except IndexError:
                    pass
                try:
                    # strip the thumbnail width parameter for the full image
                    item['imgsrc'] = block.select(
                        './/img/@src').extract()[0].replace("?w=145", "")
                    item['imgSize'] = "485x342"
                except IndexError:
                    pass
                try:
                    item['url'] = "www.breitbart.com" + block.select(
                        ".//div[@class='grid_3 alpha']/a/@href").extract()[0]
                except IndexError:
                    pass
            else:
                # plain variant
                try:
                    item['headline'] = block.select(
                        ".//h1/a/text()").extract()[0]
                except IndexError:
                    pass
                try:
                    item['url'] = "www.breitbart.com" + block.select(
                        ".//h1/a/@href").extract()[0]
                except IndexError:
                    pass
            try:
                item['date'] = block.select(
                    './/span[@class = "story-time"]/text()').extract()[0]
            except IndexError:
                pass
            item['src'] = u'Breitbart'

            items.append(item)

        return items