Exemple #1
0
 def search_email_in_domain(self, domain):
     """Collect, filter and rank the e-mail addresses found for ``domain``.

     The page at ``domain`` is loaded through the driver and parsed, every
     strategy named in ``POSSIBLE_POSITION`` is run against the parsed page,
     and for domains ending in ``.id`` the addresses returned by
     ``search_id_domain`` are merged in without duplicating candidates that
     are already present.

     Args:
         domain: Address of the site to inspect; it is passed to
             ``_go_to_page`` and to every search strategy.

     Returns:
         The candidates after ``_filter_email_candidates`` and
         ``sort_email``. An empty list is returned when no candidate is
         found or when any step inside the ``try`` raises.
     """
     self.check_driver()
     try:
         # Built once per call: maps a position name to its search method.
         finders = {
             'on_page': self._on_page,
             'on_link': self._on_link,
         }
         self._go_to_page(domain)
         soup = BS(self.driver.page_source, "lxml")
         # Find email on page and on link. An unknown position name raises
         # and is handled by the best-effort handler below.
         _email_founds = [
             finders[pos](soup, domain) for pos in POSSIBLE_POSITION
         ]
         email_candidates = Utility.flatten_list(_email_founds)
         if str(domain).endswith(('.id', '.id/')):
             # Compare address by address: testing the whole result list for
             # membership never matches, so duplicates used to be appended.
             # Assumes search_id_domain returns an iterable of addresses and
             # flatten_list returns a list -- TODO confirm. A falsy result
             # is treated as "nothing extra found".
             for email in self.search_id_domain(domain) or []:
                 if email not in email_candidates:
                     email_candidates.append(email)
         if not email_candidates:
             # No whois fallback is performed here; just report the miss.
             self.logger.info('Email not found on domain %s', domain)
             return []
         # If email found, filter it and rank it
         final_candidates = self._filter_email_candidates(email_candidates)
         return self.sort_email(final_candidates, domain)
     except Exception as exc:
         # Deliberate best-effort boundary: a failing domain yields no
         # result instead of aborting the caller. A parenthesised single
         # argument prints identically under the Python 2 print statement.
         print("Error on domain {} {} ".format(domain, str(exc)))
         return []
Exemple #2
0
 def _on_link(self, page, domain):
     self.logger.info("Search email address on link to another page")
     _email_founds = []
     # Find all possible link element
     links = page.findAll('a')
     # Find all candidate link with keyword on html page
     keyword_html_link = self._find_keyword_in_html_text(links)
     # Find all candidate link with keyword on url
     keyword_url_link = self._find_keyword_in_url(links, domain)
     # Merge the url result, remove duplicate url
     candidate_links = Utility.uniquify(keyword_html_link +
                                        keyword_url_link)
     # Check for invalid url and try to fix it
     invalid_url = [
         uri for uri in candidate_links if not cfg.url_regex.match(uri)
     ]
     try_fix_invalid_url = map(
         lambda _uri: Utility.normalize_invalid_url(_uri, domain),
         invalid_url)
     # Filter invalid url
     candidate_links = candidate_links + try_fix_invalid_url
     candidate_links = Utility.uniquify(
         [_uri for _uri in candidate_links if cfg.url_regex.match(_uri)])
     try:
         for link in candidate_links:
             self.logger.info("Go to next link: " + link)
             try:
                 self._go_to_page(link)
             except Exception, err:
                 print str(err)
                 continue
             soup = BS(self.driver.page_source, "lxml")
             email = self._on_page(soup, domain)
             _email_founds.append(email)
         return _email_founds if not _email_founds else Utility.flatten_list(
             _email_founds)
Exemple #3
0
        try:
            for link in candidate_links:
                self.logger.info("Go to next link: " + link)
                try:
                    self._go_to_page(link)
                except Exception, err:
                    print str(err)
                    continue
                soup = BS(self.driver.page_source, "lxml")
                email = self._on_page(soup, domain)
                _email_founds.append(email)
            return _email_founds if not _email_founds else Utility.flatten_list(
                _email_founds)
        except Exception, e:
            logging.error(str(e))
            return _email_founds if not _email_founds else Utility.flatten_list(
                _email_founds)

    def sort_email(self, emails, domain):
        """Rank ``emails`` for ``domain`` and keep the best ``cfg.max_email``.

        Args:
            emails: Candidate e-mail address strings.
            domain: Domain the candidates were collected for.

        Returns:
            A list of addresses ordered by ``email_scoring`` from highest to
            lowest score, truncated to ``cfg.max_email`` entries. Candidates
            with equal scores keep their incoming relative order.
        """
        # If this is not a .go.id domain, do not keep any candidate whose
        # address contains .go.id. The test must look at each address; the
        # previous code tested the whole list, so it filtered all or nothing.
        if '.go.id' not in domain:
            emails = [email for email in emails if '.go.id' not in email]
        domain_name = Utility.find_domain_name(domain)
        # email_scoring takes an (email, domain_name) payload.
        payloads = [(email, domain_name) for email in emails]
        # Sort based on score descending. A key function gives a consistent
        # ordering (the old cmp lambda never returned a positive value) and
        # scores each payload exactly once.
        payloads.sort(key=self.email_scoring, reverse=True)
        ranked = [email for email, _ in payloads]
        return ranked[:cfg.max_email]

    @staticmethod
    def email_scoring(email_payload):