コード例 #1
0
ファイル: reporter.py プロジェクト: hl2533/reporter
	def read(self, url=None, html=None, soup=None, autocue=default_autocue):
		"""Load a document, run the autocue over it and pick the news container.

		The document source is chosen in priority order: an already parsed
		``soup``, then raw ``html`` (parsed with ``self._get_soup``), then the
		body downloaded from ``url`` with ``requests.get``.

		Args:
			url: Address to download when neither ``soup`` nor ``html`` is
				given. It is also forwarded to ``self.make_urls_absolute``.
			html: Markup handed to ``self._get_soup``.
			soup: Parsed tree that is used as-is.
			autocue: Object whose ``execute(node, stage)`` is called for the
				pre-traversal, per-tag and post-traversal stages. It is
				stored on ``self.autocue``.

		Returns:
			True once ``self.news_container`` has been assigned. False when
			``self._get_soup`` raises ``TypeError`` or when none of ``url``,
			``html`` and ``soup`` was supplied.
		"""
		self.autocue = autocue

		if soup is None:
			if html is not None:
				print("making soup")
			elif url is not None:
				html = requests.get(url).content
			else:
				# Nothing to read. Previously this fell through and crashed on
				# ``None.find_all()``; report failure like the other error paths.
				self.soup = None
				return False

			try:
				soup = self._get_soup(html)
			except TypeError:
				return False

		self.soup = soup

		print("making url")
		self.make_urls_absolute(self.soup, url)

		print("pre traversal")
		self.autocue.execute(self.soup, PRE_TRAVERSAL)

		# We work our way up the DOM
		for tag in reversed(self.soup.find_all()):
			if tag.name == 'p':
				evaluate_as = EVAL_PARAGRAPH
			else:
				# If tag contains text of its own, evaluate it as a paragraph
				evaluate_as = EVAL_CONTAINER
				for child in tag.children:
					if not isinstance(child, NavigableString):
						continue
					if len(unicode(child).strip()) > 10:
						evaluate_as = EVAL_PARAGRAPH
						# One long enough text node settles it; the original
						# ``continue`` here was a no-op that kept scanning.
						break

			self.autocue.execute(tag, evaluate_as)

		print("post traversal")
		self.autocue.execute(self.soup, POST_TRAVERSAL)
		print("max score")
		self.news_container = get_tag_with_max_score(self.soup)
		return True
コード例 #2
0
ファイル: reporter.py プロジェクト: AJRenold/reporter
  def read(self, url=None, html=None, soup=None, autocue=default_autocue):
    """Load a document, run the autocue over it and pick the news container.

    The document source is chosen in priority order: an already parsed
    ``soup``, then raw ``html`` (parsed with ``self._get_soup``), then the
    body downloaded from ``url`` with ``requests.get``.

    ``self.last_url`` is reset to ``''`` on every call and set to the
    response's final URL only when the document is downloaded.
    ``self.is_html`` is likewise only updated on the download path.

    Args:
      url: Address to download when neither ``soup`` nor ``html`` is given.
        It is also forwarded to ``self.make_urls_absolute``.
      html: Markup handed to ``self._get_soup``.
      soup: Parsed tree that is used as-is.
      autocue: Object whose ``execute(node, stage)`` is called for the
        pre-traversal, per-tag and post-traversal stages. It is stored on
        ``self.autocue``.

    Returns:
      True once ``self.news_container`` has been assigned. False when
      ``self._get_soup`` raises ``TypeError`` or when none of ``url``,
      ``html`` and ``soup`` was supplied.
    """
    self.autocue = autocue
    self.last_url = ''

    if soup is None:
      if html is None:
        if url is None:
          # Nothing to read. Previously this fell through and crashed on
          # ``None.find_all()``; report failure like the other error paths.
          self.soup = None
          return False

        r = requests.get(url)
        # A response is not guaranteed to carry a Content-Type header, so
        # fall back to an empty string instead of raising KeyError.
        content_type = r.headers.get('content-type', '')
        print(content_type)
        self.is_html = 'text/html' in content_type
        print('read says %s' % self.is_html)
        html = r.content
        self.last_url = r.url

      try:
        soup = self._get_soup(html)
      except TypeError:
        return False

    self.soup = soup

    self.make_urls_absolute(self.soup, url)
    self.autocue.execute(self.soup, PRE_TRAVERSAL)

    # We work our way up the DOM
    for tag in reversed(self.soup.find_all()):
      if tag.name == 'p':
        evaluate_as = EVAL_PARAGRAPH
      else:
        # If tag contains text of its own, evaluate it as a paragraph
        evaluate_as = EVAL_CONTAINER
        for child in tag.children:
          if not isinstance(child, NavigableString):
            continue
          if len(unicode(child).strip()) > 10:
            evaluate_as = EVAL_PARAGRAPH
            # One long enough text node settles it; the original
            # ``continue`` here was a no-op that kept scanning.
            break

      self.autocue.execute(tag, evaluate_as)

    self.autocue.execute(self.soup, POST_TRAVERSAL)
    self.news_container = get_tag_with_max_score(self.soup)
    return True
コード例 #3
0
ファイル: reporter.py プロジェクト: AJRenold/reporter
    def read(self, url=None, html=None, soup=None, autocue=default_autocue):
        """Load a document, run the autocue over it and pick the news container.

        The document source is chosen in priority order: an already parsed
        ``soup``, then raw ``html`` (parsed with ``self._get_soup``), then
        the body downloaded from ``url`` with ``requests.get``.

        ``self.last_url`` is reset to ``''`` on every call and set to the
        response's final URL only when the document is downloaded.
        ``self.is_html`` is likewise only updated on the download path.

        Args:
            url: Address to download when neither ``soup`` nor ``html`` is
                given. It is also forwarded to ``self.make_urls_absolute``.
            html: Markup handed to ``self._get_soup``.
            soup: Parsed tree that is used as-is.
            autocue: Object whose ``execute(node, stage)`` is called for the
                pre-traversal, per-tag and post-traversal stages. It is
                stored on ``self.autocue``.

        Returns:
            True once ``self.news_container`` has been assigned. False when
            ``self._get_soup`` raises ``TypeError`` or when none of ``url``,
            ``html`` and ``soup`` was supplied.
        """
        self.autocue = autocue
        self.last_url = ''

        if soup is None:
            if html is None:
                if url is None:
                    # Nothing to read. Previously this fell through and
                    # crashed on ``None.find_all()``; report failure like
                    # the other error paths.
                    self.soup = None
                    return False

                r = requests.get(url)
                # A response is not guaranteed to carry a Content-Type
                # header, so fall back to '' instead of raising KeyError.
                content_type = r.headers.get('content-type', '')
                print(content_type)
                self.is_html = 'text/html' in content_type
                print('read says %s' % self.is_html)
                html = r.content
                self.last_url = r.url

            try:
                soup = self._get_soup(html)
            except TypeError:
                return False

        self.soup = soup

        self.make_urls_absolute(self.soup, url)
        self.autocue.execute(self.soup, PRE_TRAVERSAL)

        # We work our way up the DOM
        for tag in reversed(self.soup.find_all()):
            if tag.name == 'p':
                evaluate_as = EVAL_PARAGRAPH
            else:
                # If tag contains text of its own, evaluate it as a paragraph
                evaluate_as = EVAL_CONTAINER
                for child in tag.children:
                    if not isinstance(child, NavigableString):
                        continue
                    if len(unicode(child).strip()) > 10:
                        evaluate_as = EVAL_PARAGRAPH
                        # One long enough text node settles it; the original
                        # ``continue`` here was a no-op that kept scanning.
                        break

            self.autocue.execute(tag, evaluate_as)

        self.autocue.execute(self.soup, POST_TRAVERSAL)
        self.news_container = get_tag_with_max_score(self.soup)
        return True
コード例 #4
0
ファイル: reporter.py プロジェクト: pragnesh/reporter
    def read(self, url=None, html=None, soup=None, autocue=default_autocue):
        """Load a document, run the autocue over it and pick the news container.

        The document source is chosen in priority order: an already parsed
        ``soup``, then raw ``html`` (parsed with ``self._get_soup``), then
        the body downloaded from ``url`` with ``requests.get``. The result
        is stored on ``self.soup`` and the selected tag on
        ``self.news_container``; nothing is returned.

        Args:
            url: Address to download when neither ``soup`` nor ``html`` is
                given. It is also forwarded to ``self.make_urls_absolute``.
            html: Markup handed to ``self._get_soup``.
            soup: Parsed tree that is used as-is.
            autocue: Object whose ``execute(node, stage)`` is called for the
                pre-traversal, per-tag and post-traversal stages. It is
                stored on ``self.autocue``.

        Raises:
            ValueError: If none of ``url``, ``html`` and ``soup`` is given.
        """
        self.autocue = autocue

        if soup is not None:
            self.soup = soup
        elif html is not None:
            self.soup = self._get_soup(html)
        elif url is not None:
            html = requests.get(url).content
            self.soup = self._get_soup(html)
        else:
            # Nothing to read. Previously this fell through and failed with
            # an incidental AttributeError on ``None``; fail with a clear
            # message instead.
            self.soup = None
            raise ValueError("read() requires one of url, html or soup")

        self.make_urls_absolute(self.soup, url)

        self.autocue.execute(self.soup, PRE_TRAVERSAL)

        # We work our way up the DOM
        for tag in reversed(self.soup.find_all()):
            if tag.name == 'p':
                evaluate_as = EVAL_PARAGRAPH
            else:
                # If tag contains text of its own, evaluate it as a paragraph
                evaluate_as = EVAL_CONTAINER
                for child in tag.children:
                    if not isinstance(child, NavigableString):
                        continue
                    if len(unicode(child).strip()) > 10:
                        evaluate_as = EVAL_PARAGRAPH
                        # One long enough text node settles it; the original
                        # ``continue`` here was a no-op that kept scanning.
                        break

            self.autocue.execute(tag, evaluate_as)

        self.autocue.execute(self.soup, POST_TRAVERSAL)
        self.news_container = get_tag_with_max_score(self.soup)