コード例 #1
0
 def fetch_domains(self):
     """Rebuild ``self.domains`` from ``self.links`` and return it.

     Every entry of ``self.links`` is wrapped in a ``Link`` (with
     ``self.url`` passed as the second argument). The ``domain`` attribute
     of each link whose ``is_valid()`` is truthy is kept, in input order.
     """
     self.domains = []
     candidates = (Link(href, self.url) for href in self.links)
     self.domains.extend(
         link.domain for link in candidates if link.is_valid()
     )
     return self.domains
コード例 #2
0
 def fetch_domains_id(self):
     """Rebuild ``self.domain_ids`` from ``self.links`` and return it.

     Each entry of ``self.links`` is wrapped in a ``Link`` built with
     ``self.url`` as second argument; links failing ``is_valid()`` are
     skipped, and the ``netloc`` attribute of the others is collected.
     """
     collected = []
     self.domain_ids = collected
     for href in self.links:
         link = Link(href, self.url)
         if not link.is_valid():
             continue
         collected.append(link.netloc)
     return self.domain_ids
コード例 #3
0
ファイル: article.py プロジェクト: malstor/crawtext
	def fetch_domains_id(self):
		"""Rebuild ``self.domain_ids`` from ``self.links`` and return it.

		Every entry of ``self.links`` is wrapped in a ``Link`` (with
		``self.url`` passed as the second argument). The ``netloc`` attribute
		of each link whose ``is_valid()`` is truthy is kept, in input order.
		"""
		self.domain_ids = []
		candidates = (Link(href, self.url) for href in self.links)
		self.domain_ids.extend(
			link.netloc for link in candidates if link.is_valid()
		)
		return self.domain_ids
コード例 #4
0
ファイル: article.py プロジェクト: malstor/crawtext
	def fetch_domains(self):
		"""Rebuild ``self.domains`` from ``self.links`` and return it.

		Each entry of ``self.links`` is wrapped in a ``Link`` built with
		``self.url`` as second argument; links failing ``is_valid()`` are
		skipped, and the ``domain`` attribute of the others is collected.
		"""
		collected = []
		self.domains = collected
		for href in self.links:
			link = Link(href, self.url)
			if not link.is_valid():
				continue
			collected.append(link.domain)
		return self.domains
コード例 #5
0
 def is_valid(self):
     """Return True when a ``Link`` built from this object validates.

     The link is constructed from ``self.url``, ``self.source_url`` and
     ``self.debug``. On failure the link's ``msg`` and ``code`` are copied
     onto ``self``, ``self.step`` is set to "Validating page",
     ``self.status`` is set to False, and False is returned.
     """
     link = Link(self.url, self.source_url, self.debug)
     if link.is_valid():
         return True

     # Validation failed: record why on this object before reporting it.
     self.msg = link.msg
     self.code = link.code
     self.step = "Validating page"
     self.status = False
     return False
コード例 #6
0
ファイル: article.py プロジェクト: malstor/crawtext
	def is_valid(self):
		"""Return True when a ``Link`` built from this object validates.

		The link is constructed from ``self.url``, ``self.source_url`` and
		``self.debug``. On failure the link's ``msg`` and ``code`` are copied
		onto ``self``, ``self.step`` is set to "Validating page",
		``self.status`` is set to False, and False is returned.
		"""
		link = Link(self.url, self.source_url, self.debug)
		if link.is_valid():
			return True

		# Validation failed: record why on this object before reporting it.
		self.msg = link.msg
		self.code = link.code
		self.step = "Validating page"
		self.status = False
		return False
コード例 #7
0
ファイル: article.py プロジェクト: malstor/crawtext
	def fetch_links(self):
		"""Extract links, domains and domain ids from the document's anchors.

		Reads the ``href`` of every ``<a>`` element returned by
		``self.doc.find_all("a")`` and resets ``self.links``,
		``self.domains`` and ``self.domain_ids`` before refilling them.

		An href is ignored when it is missing or empty, is exactly "/",
		starts with "#", or starts with "mailto" or "javascript". Every
		remaining href is wrapped in a ``Link``; when ``is_valid()`` is
		truthy, ``clean_url(href, self.url)`` supplies the
		(url, domain, domain_id) triple that is appended to the three lists.

		Returns:
			The tuple ``(self.links, self.domains, self.domain_ids)``.
		"""
		self.domains = []
		self.links = []
		self.domain_ids = []

		for anchor in self.doc.find_all("a"):
			href = anchor.get('href')
			# Nothing to follow: no href, bare site root, or in-page fragment.
			if not href or href == "/" or href[0] == "#":
				continue
			# Both schemes must be skipped *before* a Link is built; testing
			# them in one guard keeps mailto from slipping through to Link.
			if href.startswith(('mailto', 'javascript')):
				continue
			link = Link(href)
			if link.is_valid():
				cleaned, domain, domain_id = link.clean_url(href, self.url)
				self.domains.append(domain)
				self.links.append(cleaned)
				self.domain_ids.append(domain_id)
		return (self.links, self.domains, self.domain_ids)
コード例 #8
0
    def fetch_links(self):
        """Extract links, domains and domain ids from the document's anchors.

        Reads the ``href`` of every ``<a>`` element returned by
        ``self.doc.find_all("a")`` and resets ``self.links``,
        ``self.domains`` and ``self.domain_ids`` before refilling them.

        An href is ignored when it is missing or empty, is exactly "/",
        starts with "#", or starts with "mailto" or "javascript". Every
        remaining href is wrapped in a ``Link``; when ``is_valid()`` is
        truthy, ``clean_url(href, self.url)`` supplies the
        (url, domain, domain_id) triple that is appended to the three lists.

        Returns:
            The tuple ``(self.links, self.domains, self.domain_ids)``.
        """
        self.domains = []
        self.links = []
        self.domain_ids = []

        for anchor in self.doc.find_all("a"):
            href = anchor.get('href')
            # Nothing to follow: no href, bare site root, or in-page fragment.
            if not href or href == "/" or href[0] == "#":
                continue
            # Both schemes must be skipped *before* a Link is built; testing
            # them in one guard keeps mailto from slipping through to Link.
            if href.startswith(('mailto', 'javascript')):
                continue
            link = Link(href)
            if link.is_valid():
                cleaned, domain, domain_id = link.clean_url(href, self.url)
                self.domains.append(domain)
                self.links.append(cleaned)
                self.domain_ids.append(domain_id)
        return (self.links, self.domains, self.domain_ids)
コード例 #9
0
 def check_link(self, url, source_url):
     """Return the ``url`` attribute of a validated ``Link``, else None.

     A ``Link`` is built from ``url`` and ``source_url``; its ``url``
     attribute is returned only when ``is_valid()`` is truthy.
     """
     link = Link(url, source_url)
     return link.url if link.is_valid() else None
コード例 #10
0
ファイル: article.py プロジェクト: malstor/crawtext
	def check_link(self, url, source_url):
		"""Return the ``url`` attribute of a validated ``Link``, else None.

		A ``Link`` is built from ``url`` and ``source_url``; its ``url``
		attribute is returned only when ``is_valid()`` is truthy.
		"""
		link = Link(url, source_url)
		if not link.is_valid():
			return None
		return link.url