Example #1
        def _parse_emails(doc):
            # `emails` is the accumulator list from the enclosing scope (a closure);
            # this is the inner helper of mine_emails shown in Example #4 below
            # first try the link texts only, because they are more reliable
            link_texts = doc.q("//a").join(' | ')
            for email in common.get_emails(link_texts):
                if '@' in email and email not in emails:
                    emails.append(email)

            if not emails:
                # fall back to the page text, with scripts stripped out
                html = doc.remove("//script").html()
                for email in common.get_emails(html):
                    if '@' in email and email not in emails:
                        emails.append(email)
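
The fragment above is the inner helper from Example #4; a minimal sketch of the enclosing context it assumes (the names and the sample page are illustrative, not from the source):

    # hypothetical enclosing scope for the closure above
    emails = []                          # accumulator shared with _parse_emails
    doc = Doc(url=url, html=page_html)   # Doc as used in Example #5
    _parse_emails(doc)
    print(emails)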
Example #2
    def find_emails(self, url, emails_dict, deep=2, link_filter=None):
        """Crawl url up to `deep` levels and collect emails into emails_dict[url]."""
        if not url: return []
        if not common.subreg(url, '^(http)'):
            url = 'http://' + url
        if '@' in url:
            # the input already looks like an email address, no crawling needed
            return common.get_emails(url)
        if url not in emails_dict:
            emails_dict[url] = []

        if not link_filter:
            # default filter: only follow links that look like contact/about pages
            def link_filter(url):
                keywords = [
                    "contact", "about", "agent", "info", "imprint", "kontakt",
                    "uber", "wir", "impressum", "contacter", "representatives"
                ]
                for kw in keywords:
                    if kw in url.lower():
                        return True
                return False

        def parse(doc):
            for email in common.get_emails(doc.html()):
                if email not in emails_dict[url]:
                    emails_dict[url].append(email)

        # queue the crawl over links and iframes; start_now=False leaves it
        # to the caller to actually start the loop
        self.loop(url=url,
                  next="//a/@href | //iframe/@src",
                  deep=deep,
                  link_filter=link_filter,
                  cb=parse,
                  cc=10,
                  start_now=False)
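
A hedged usage sketch; the scraper instance and the method that actually runs the queued loop are assumptions, not part of the source. Note that find_emails normalizes the URL first, so the dictionary key is the http:// form:

    emails_dict = {}
    scraper.find_emails('example.com', emails_dict, deep=2)
    scraper.run()   # hypothetical: start the loop queued with start_now=False
    print(emails_dict.get('http://example.com', []))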
Example #3
    def _parse_emails(doc):
        emails = []
        # first try the link texts only, because they are more reliable
        link_texts = doc.q("//a").join(' | ')
        for email in common.get_emails(link_texts):
            if '@' in email and email not in emails:
                emails.append(email)

        if not emails:
            # fall back to the page text, with scripts stripped out
            html = doc.remove("//script").html()
            for email in common.get_emails(html):
                if '@' in email and email not in emails:
                    emails.append(email)
        return emails
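
Unlike Example #1, this version owns its emails list and returns it, so it can be driven directly. A minimal sketch, assuming the Doc class and the s.load_html helper that appear in Example #5:

    # hypothetical driver for the standalone helper above
    html = s.load_html('http://example.com')
    doc = Doc(url='http://example.com', html=html)
    print(_parse_emails(doc))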
Example #4
    def mine_emails(self, url):
        """ 
		looks for emails on key pages of a website: homepage, contact

		"""
        if not url: return []
        if not common.subreg(url, '^(http)'):
            url = 'http://' + url
        if '@' in url:
            return common.get_emails(url)
        domain = common.get_domain(url)
        emails = []

        def _parse_emails(doc):
            # first try the link texts only, because they are more reliable
            link_texts = doc.q("//a").join(' | ')
            for email in common.get_emails(link_texts):
                if '@' in email and email not in emails:
                    emails.append(email)

            if not emails:
                # fall back to the page text, with scripts stripped out
                html = doc.remove("//script").html()
                for email in common.get_emails(html):
                    if '@' in email and email not in emails:
                        emails.append(email)

        homepage = self.load(url)
        _parse_emails(homepage)

        if emails:
            # found on the homepage, no need to look at any other page
            return emails

        contact_url = homepage.x(
            "//a[contains(@href,'contact') or contains(@href,'Contact')]/@href"
        )

        if contact_url:
            contactpage = self.load(contact_url)
            _parse_emails(contactpage)

        return emails
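
A minimal usage sketch; the scraper instance is an assumption, and mine_emails itself adds the http:// prefix when it is missing:

    # hypothetical: scraper is an instance of the class that defines mine_emails
    emails = scraper.mine_emails('example.com')
    if not emails:
        print('no emails found on the homepage or contact page')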
Example #5
def mine_emails(url, br=None, deep_level=1):
    """
	deep_level = 1: scrape home page and contact page only

	"""

    if not url: return []
    if not common.subreg(url, '^(http)'):
        url = 'http://' + url
    if '@' in url:
        return common.get_emails(url)

    domain = common.get_domain(url).lower()

    history = {}

    def _load_page(page_url, current_level):
        """
		Please make sure this _url is not loaded yet, to avoid loaded twice

		"""
        logger.debug('mine_emails page %s, level %s', page_url, current_level)
        html = ''
        if br:
            try:
                br.get(page_url)

                html = br.page_source

            except Exception as e:
                logger.warning('failed to _load_page: %s', page_url)
                # logger.exception(e)
                raise e  # re-raise so the caller can create a new br

        else:

            html = s.load_html(page_url)

        doc = Doc(url=page_url, html=html)
        #update loaded links
        links = doc.q("//a")

        sub_urls = []

        for link in links:
            _url = link.href()

            if domain not in _url.lower():
                continue

            if _url in history:
                continue
            if _url not in sub_urls:
                sub_urls.append(_url)

        history[page_url] = (current_level + 1, sub_urls)  # children belong to the next level

        return doc

    def _parse_emails(doc):
        emails = []
        # first try the link texts only, because they are more reliable
        link_texts = doc.q("//a").join(' | ')
        for email in common.get_emails(link_texts):
            if '@' in email and email not in emails:
                emails.append(email)

        if not emails:
            # fall back to the page text, with scripts stripped out
            html = doc.remove("//script").html()
            for email in common.get_emails(html):
                if '@' in email and email not in emails:
                    emails.append(email)
        return emails

    def _load_subpages(level):
        # first, collect every URL recorded at this level in the history
        urls = []
        for page in history:
            _level, suburls = history[page]
            if _level != level:
                continue

            for suburl in suburls:
                if suburl in history:
                    continue

                if suburl not in urls:
                    urls.append(suburl)

        logger.debug('mine emails in level %s, with %s urls to process', level,
                     len(urls))
        for suburl in urls:
            doc = _load_page(suburl, level)
            emails = _parse_emails(doc)
            if emails:
                # found emails on this page, that is enough
                return emails

        # nothing found at this level
        return []

    doc = _load_page(url, current_level=1)
    emails = _parse_emails(doc)

    if emails:
        return emails

    contact_url = doc.x(
        "//a[contains(@href,'contact') or contains(@href,'Contact')]/@href")
    if contact_url:
        doc = _load_page(contact_url, current_level=2)
        emails = _parse_emails(doc)

        # once a contact page is found, there is no need to dig further, even if it has no emails

        return emails

    #try with level 2

    if deep_level >= 2:
        emails = _load_subpages(level=2)
        if emails:
            return emails

    #try with level 3

    if deep_level >= 3:
        emails = _load_subpages(level=3)
        if emails:
            return emails

    #not found
    return []
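
A hedged usage sketch. Without the br argument, pages are fetched through s.load_html; with it, any object exposing a Selenium-style .get()/.page_source interface works. The Chrome setup below is an assumed option, not part of the source:

    # plain HTTP fetch: home page, contact page, then level-2 links
    emails = mine_emails('example.com', deep_level=2)

    # browser-backed fetch for JavaScript-heavy sites (assumed setup)
    from selenium import webdriver
    br = webdriver.Chrome()
    try:
        emails = mine_emails('example.com', br=br, deep_level=1)
    finally:
        br.quit()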
Example #6
    def parse(doc):
        # `emails_dict` and `url` come from the enclosing scope;
        # this is the crawl callback from Example #2 above
        for email in common.get_emails(doc.html()):
            if email not in emails_dict[url]:
                emails_dict[url].append(email)
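
A sketch of the enclosing names this callback expects (illustrative only; in the source they are set up by find_emails in Example #2):

    # hypothetical enclosing scope for the callback above
    url = 'http://example.com'
    emails_dict = {url: []}
    parse(doc)                 # doc is loaded by the crawl loop, as in Example #2
    print(emails_dict[url])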