Example 1
    def __init__(self, key, proxy=None):

        self.base_url = "https://api.datamarket.azure.com/Bing/SearchWeb/v1/Web?$format=json&"
        # Building authentication from key
        s = '%s:%s' % (key, key)
        credentials = base64.b64encode(s.encode('utf-8'))
        self.auth = 'Basic %s' % credentials.decode('utf-8')

        # Markets for localized and more accurate search
        self.markets = {
            "FR": "fr-FR",
            "BE": "fr-BE",
            "GB": "en-GB",
            "US": "en-US",
            "DE": "de-DE",
            "UK": "en-GB"
        }

        # Fetcher initialization
        self.fetcher = Fetcher(proxy=proxy)
        self.fetcher.headers["Authorization"] = self.auth

        # Logging initialization
        self.logger = logging.getLogger("webmining:bingapi_fetcher")
        self.logger.setLevel(logging.INFO)
        self.wrapper = HTML5Wrapper()
Example 2
    def __init__(self, proxy):
        self.fetcher = Fetcher(proxy=proxy)

        # CMS platforms identifiable via a specific URL
        self.paths = {
            "wordpress": {
                "path": "wp-login.php",
                "expression": "wordpress"
            },
            "drupal": {
                "path": "user",
                "expression": "user-login"
            },
            "isotools": {
                "path": "identification.aspx",
                "expression": "isotools"
            },
            "joomla": {
                "path": "administrator",
                "expression": "joomla"
            },
            "spip": {
                "path": "?page=login",
                "expression": "spip"
            }
        }
        # CMS platforms identifiable via a specific pattern in the HTML
        self.patterns = {
            "typo3": {
                "expression": "this website is powered by typo3"
            },
            "ezpublish": {
                "expression": "/content/advancedsearch"
            }
        }
Example 3
 def __init__(self, proxy=None):
     self.fetcher = Fetcher(proxy=proxy)
     self.normalizer = normalizer.Normalizer()
     self.wrapper = HTML5Wrapper()
     self._rdomain = re.compile("^[a-z]{2,3}\\.linkedin\\.com$")
     self._rpath1 = re.compile(
         "^\\/pub\\/[^\\/]+(\\/[0-9abcdef]{1,3}){3}(\\/[a-zA-Z]+)?$")
     self._rpath2 = re.compile("^\\/in\\/[^\\/]+")
     self._rtitle = re.compile("^(.+) - ([^\\|]+) \\| LinkedIn$")
Example 4
    def __init__(self, tld="fr", proxy=None):
        self.fetcher = Fetcher(proxy=proxy)
        # No TLD-based differences here; the tld parameter is ignored
        # http://fr.search.yahoo.com/search?p=LE+PAC+DECOUPE+PORTET+SUR+GARONNE&toggle=1&cop=mss&ei=UTF-8&fr=yfp-t-703
        self.base_url = "http://fr.search.yahoo.com/search?toggle=1&cop=mss&ei=UTF-8&fr=yfp-t-703&"

        # Logging initialization
        self.logger = logging.getLogger("webmining:yahoo_fetcher")
        self.logger.setLevel(logging.INFO)
        self.wrapper = HTML5Wrapper()
Example 5
 def __init__(self, proxy=None):
     self.fetcher = Fetcher(proxy=proxy)
     self.wrapper = HTML5Wrapper()
     self.normalizer = normalizer.Normalizer()
     self.valid_domains = re.compile("^.*.viadeo." + self.LANG + "$")
     self._rpath1 = re.compile("^\\/" + self.LANG +
                               "\\/profile\\/([^\\/]+).*$")
     self._rpath2 = re.compile("^\\/r\\/profile\\/([^\\/]+)\\/" +
                               self.LANG + "\\/public(\\/.*)?$")
     self._rtitle = re.compile("^([^,]+).*$")
Example 6
  def __init__(self, tld="fr", proxy=None):
    self.fetcher = Fetcher(proxy=proxy)
    self.cookie = {"_FS": "NU=1&mkt=fr-FR&ui=fr-FR"}
    # No TLD-based differences in Bing; the tld parameter is ignored
    self.base_url = "http://www.bing.com/search?qs=n&form=QBLH&filt=all&sc=0-13&sp=-1&sk=&pq="

    # Logging initialization
    self.logger = logging.getLogger("webmining:bing_fetcher")
    self.logger.setLevel(logging.INFO)
    self.wrapper = HTML5Wrapper()
Example 7
class ShoppingExtractor:
    # Shopping: OSCommerce, Prestashop, Magento, Open Cart
    def __init__(self, proxy):
        self.fetcher = Fetcher(proxy=proxy)

        # Shopping engines identifiable via a specific cookie
        self.cookies = {"oscommerce": {"cookie": "osCsid"}}

        # Shopping engines identifiable via a specific pattern in the HTML
        self.patterns = {
            "prestashop": {
                "tag": "meta",
                "attribute": "content",
                "expression": "prestashop"
            },
            "magento": {
                "tag": "link",
                "attribute": "href",
                "expression": "/skin/frontend/"
            },
            "opencart": {
                "tag": "a",
                "attribute": "href",
                "expression": "route=checkout/cart"
            }
        }

    def extract(self,
                dom,
                raw_txt,
                relevant_txt,
                url,
                firstpage,
                country="FR",
                lang="FR"):
        results = {"ecommerce": []}

        # This needs a fetch, so we only do it for the first page of the crawl
        if firstpage:
            # Shopping engines identifiable via a specific cookie
            for shop in self.cookies:
                fr = self.fetcher.fetch(url)
                if fr is not None and self.cookies[shop][
                        "cookie"] in fr.cookies.keys():
                    results["ecommerce"].append({"type": shop, "url": url})

                    return results

        # Shopping engines identifiable via a specific pattern in the HTML
        for shop in self.patterns:
            tags = dom(self.patterns[shop]["tag"] + "[" +
                       self.patterns[shop]["attribute"] + "]")
            for tag in tags.items():
                if self.patterns[shop]["expression"] in (
                        tag.attr[self.patterns[shop]["attribute"]]
                        or "").lower():
                    results["ecommerce"].append({"type": shop, "url": url})
                    return results

        return results
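
A minimal usage sketch for the extractor above; the shop URL, the reuse of the extractor's own Fetcher, and passing the raw HTML for both text arguments are assumptions for illustration only:

# Hypothetical driver for ShoppingExtractor (not part of the original code)
extractor = ShoppingExtractor(proxy=None)
wrapper = HTML5Wrapper()

fr = extractor.fetcher.fetch("http://shop.example.com/")
if fr is not None and fr.webpage is not None:
    dom = wrapper.pq(fr.webpage)
    found = extractor.extract(dom, fr.webpage, fr.webpage,
                              "http://shop.example.com/", firstpage=True)
    # e.g. {"ecommerce": [{"type": "prestashop", "url": "http://shop.example.com/"}]}
    print(found)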
Example 8
class YahooFetcher:
    """
  Fetches Yahoo results for a given query
  """
    def __init__(self, tld="fr", proxy=None):
        self.fetcher = Fetcher(proxy=proxy)
        # No TLD-based differences here; the tld parameter is ignored
        # http://fr.search.yahoo.com/search?p=LE+PAC+DECOUPE+PORTET+SUR+GARONNE&toggle=1&cop=mss&ei=UTF-8&fr=yfp-t-703
        self.base_url = "http://fr.search.yahoo.com/search?toggle=1&cop=mss&ei=UTF-8&fr=yfp-t-703&"

        # Logging initialization
        self.logger = logging.getLogger("webmining:yahoo_fetcher")
        self.logger.setLevel(logging.INFO)
        self.wrapper = HTML5Wrapper()

    def parse(self, webpage, bresults, limit):
        webpage = self.wrapper.pq(webpage)
        if webpage is not None:
            for r in webpage(".res").items():
                gr = YahooResult(r, self.wrapper)
                bresults.append(gr)

                if len(bresults) >= limit:
                    break
        return webpage

    def fetch(self, q, limit=10, start=0):
        """
    Fetches Yahoo with the query q and sends back 
    a list of results.
    param: q: a query, as a string
    param: limit: the amount of results needed (1 to 10)
    param: start: the starting offset
    return: a list of YahooResult
    """
        bresults = []
        # NB: the parameter to augment the amount of results is 'count'
        query = urllib.parse.urlencode({"p": q})
        url = self.base_url + query
        fr = self.fetcher.fetch(url, debug=True)

        self.logger.debug("Fetched url [%s]" % url)
        if fr is None or fr.webpage is None:
            self.logger.warn("Got nothing from [%s]" % url)
            return bresults

        self.logger.debug("Returned result - " + str(fr.fetched_url))
        f = open("index.html", "w")
        f.write(fr.webpage)
        f.close()
        fr.webpage = self.parse(fr.webpage, bresults, limit)

        self.logger.info("Fetched [%s] with %d results" % (url, len(bresults)))
        return bresults
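
A short, hedged example of driving this fetcher (the query string is purely illustrative and the import path is assumed):

# Hypothetical usage of YahooFetcher
yahoo = YahooFetcher(proxy=None)
results = yahoo.fetch("data publica paris", limit=5)
for r in results:
    print(r)  # each r is a YahooResult built from a ".res" node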
Example 9
    def __init__(self,
                 filename=None,
                 seedlist=None,
                 debug=False,
                 proxy=None,
                 multiproc=True,
                 mode=CrawlMode.entire,
                 max_page_size=PAGE_SIZE_LIMIT):
        """
        :param filename: path to the seed file
        :param mode: crawling mode, either "entire", "single", "subpath"
        """
        self.seed = None
        self.debug = debug
        # init the fetcher with a download limit size
        self.fetcher = Fetcher(proxy, max_page_size=max_page_size)
        self.htmltools = HTML5Wrapper()
        self.crawl_depth = 0  # Do we crawl domains outside the seed
        self.domain_depth = 0  # At which depth each seed element must be crawled
        self.page_limit = 0  # Max amount of pages to be crawled
        self.max_page_size = max_page_size
        self.website = Website()
        self.me = MetaExtractor(proxy=proxy)

        self.badextensions = set(["pdf", "xls", "doc", "ppt", "rtf", "odt", "zip", "tar.gz", "tar", "exe", \
                                  "jpg", "png", "jpeg", "bmp", "gif", "mp3", "flv", "rar", "ogv", "avi", "mp4", \
                                  "mkg", "ps", "ogg", "webm", "ogm", "pps", "pptx", "docx", "xlsx", "mpg", "mov", \
                                  "mkv", "mpeg", "m4v", "iso"])

        self.crawling_process_over = False

        # Logging initialization
        self.logger = logging.getLogger("webmining:crawler")
        self.logger.setLevel(logging.INFO)
        if debug:
            self.logger.setLevel(logging.DEBUG)
        self.filename = filename
        self.seedlist = seedlist
        self.mode = mode
        self.authorized_domains = set()
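
This constructor is normally driven through the crawl() generator shown in the full Crawler example at the end of this page; a hedged sketch of that flow, with an invented seed URL and meta names:

# Hypothetical usage of Crawler
crawler = Crawler(seedlist=["http://example.com"], mode=CrawlMode.entire)
for page in crawler.crawl(proc=0, domain_depth=1, page_limit=50,
                          wait_courtesy=1.0, metas=["email", "phone"]):
    print(page.url, page.metas)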
Example 10
class BingFetcher:
  """
  Fetches Bing results for a given query
  """
  def __init__(self, tld="fr", proxy=None):
    self.fetcher = Fetcher(proxy=proxy)
    self.cookie = {"_FS": "NU=1&mkt=fr-FR&ui=fr-FR"}
    # No TLD-based differences in Bing; the tld parameter is ignored
    self.base_url = "http://www.bing.com/search?qs=n&form=QBLH&filt=all&sc=0-13&sp=-1&sk=&pq="

    # Logging initialization
    self.logger = logging.getLogger("webmining:bing_fetcher")
    self.logger.setLevel(logging.INFO)
    self.wrapper = HTML5Wrapper()

  def parse(self, webpage, bresults, limit):
    webpage = self.wrapper.pq( webpage )
    if webpage is not None:
      for r in webpage("li.sa_wr").items():
        gr = BingResult(r, self.wrapper)
        bresults.append(gr)

        if len(bresults) >= limit:
          break
    return webpage

  def fetch(self, q, limit=10, start=0):
    """
    Fetches Bing with the query q and sends back 
    a list of results.
    param: q: a query, as a string
    param: limit: the amount of results needed (1 to 10)
    param: start: the starting offset
    return: a list of BingResult
    """
    bresults = []
    # NB: the parameter to augment the amount of results is 'count'
    query = urllib.parse.urlencode( { "q": q, "first": start } )
    url = self.base_url + query
    fr = self.fetcher.fetch(url, debug=True, cookies=self.cookie)
   
    self.logger.debug("Fetched url [%s]" % url)
    if fr is None or fr.webpage is None:
      self.logger.warn("Got nothing from [%s]" % url)
      return bresults
    
    self.logger.debug("Returned result - " + str(fr.fetched_url))
    fr.webpage = self.parse(fr.webpage, bresults, limit)
    
    self.logger.info("Fetched [%s] with %d results" % (url, len(bresults)))
    return bresults
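
A hedged usage sketch; note that the cookie set in __init__ forces the fr-FR market regardless of the tld argument (the query string is illustrative):

# Hypothetical usage of BingFetcher
bing = BingFetcher(proxy=None)
results = bing.fetch("data publica", limit=10, start=0)
print("%d results" % len(results))  # each item is a BingResult parsed from "li.sa_wr"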
Example 11
    def __init__(self, proxy):
        self.fetcher = Fetcher(proxy=proxy)

        # Shopping engines identifiable via a specific cookie
        self.cookies = {"oscommerce": {"cookie": "osCsid"}}

        # Shopping engines identifiable via a specific pattern in the HTML
        self.patterns = {
            "prestashop": {
                "tag": "meta",
                "attribute": "content",
                "expression": "prestashop"
            },
            "magento": {
                "tag": "link",
                "attribute": "href",
                "expression": "/skin/frontend/"
            },
            "opencart": {
                "tag": "a",
                "attribute": "href",
                "expression": "route=checkout/cart"
            }
        }
Example 12
class GoogleFetcher:
    """
  Fetches google results for a given query
  """
    def __init__(self, tld="fr", proxy=None):
        self.fetcher = Fetcher(proxy=proxy)
        self.wrapper = HTML5Wrapper()
        self.base_url = "http://www.google.%s/search?rls=en&ie=UTF-8&oe=UTF-8&" % tld

    def parse(self, webpage, gresults, limit):
        webpage = self.wrapper.pq(webpage)
        #html = html.decode( "utf-8" )
        if webpage is not None:
            for r in webpage(".g").items():
                gr = GoogleResult(r, self.wrapper)
                gresults.append(gr)

                if len(gresults) >= limit:
                    break
        return webpage

    def fetch(self, q, limit=10, start=0):
        """
    Fetches Google with the query q and sends back 
    a list of results.
    param: q: a query, as a string
    param: limit: the amount of results needed (1 to 10)
    param: start: the starting offset
    return: a list of GoogleResult
    """
        gresults = []
        query = urllib.parse.urlencode({"q": q, "start": start})
        url = self.base_url + query
        fr = self.fetcher.fetch(url, debug=True)

        if fr is None or fr.webpage is None:
            return gresults

        if fr.fetched_url.startswith("http://www.google.fr/sorry/"):
            raise GoogleBlacklistingError()

        fr.webpage = self.parse(fr.webpage, gresults, limit)
        return gresults
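
A hedged usage sketch showing the blacklisting case raised by fetch() (query and handling strategy are assumptions):

# Hypothetical usage of GoogleFetcher
google = GoogleFetcher(tld="fr", proxy=None)
try:
    results = google.fetch("data publica", limit=10)
except GoogleBlacklistingError:
    # Google answered with its /sorry/ captcha page; back off before retrying
    results = []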
Example 13
class CMSExtractor:
    # CMS: Wordpress, Drupal, Typo3, IsoTools, Joomla, Spip, EzPublish
    def __init__(self, proxy):
        self.fetcher = Fetcher(proxy=proxy)

        # CMS platforms identifiable via a specific URL
        self.paths = {
            "wordpress": {
                "path": "wp-login.php",
                "expression": "wordpress"
            },
            "drupal": {
                "path": "user",
                "expression": "user-login"
            },
            "isotools": {
                "path": "identification.aspx",
                "expression": "isotools"
            },
            "joomla": {
                "path": "administrator",
                "expression": "joomla"
            },
            "spip": {
                "path": "?page=login",
                "expression": "spip"
            }
        }
        # CMS platforms identifiable via a specific pattern in the HTML
        self.patterns = {
            "typo3": {
                "expression": "this website is powered by typo3"
            },
            "ezpublish": {
                "expression": "/content/advancedsearch"
            }
        }

    def extract(self,
                dom,
                raw_txt,
                relevant_txt,
                url,
                firstpage,
                country="FR",
                lang="FR"):
        results = {"cms": []}
        found = set()

        # CMS platforms identifiable via a specific URL
        # This needs a fetch, so we only do it for the first page of the crawl
        if firstpage:
            for cms in self.paths:
                up = urlparse(url)
                domain = up.scheme + "://" + up.netloc
                link = urljoin(domain, self.paths[cms]["path"])
                fr = self.fetcher.fetch(link)

                if fr is not None and fr.webpage is not None and \
                   fr.content_type is not None and "text/html" in fr.content_type.lower() and \
                   self.paths[cms]["expression"] in fr.webpage.lower():

                    if cms not in found:
                        results["cms"].append({"type": cms, "url": link})
                        found.add(cms)
                        # return results

        # CMS platforms identifiable via a specific pattern in the HTML
        for cms in self.patterns:
            if self.patterns[cms]["expression"] in raw_txt.lower():
                if cms not in found:
                    results["cms"].append({"type": cms, "url": url})
                    found.add(cms)
                    # return results

        # detect typo3 via meta as well
        if "typo3" not in results and len(
                dom("meta[name='generator'][content*='TYPO3']")) > 0:
            cms = "typo3"
            if cms not in found:
                results["cms"].append({"type": cms, "url": url})
                found.add(cms)

        return results
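
A minimal usage sketch for the CMS extractor above; the URL is invented and passing the raw HTML for both raw_txt and relevant_txt is a simplification for illustration:

# Hypothetical driver for CMSExtractor
cms = CMSExtractor(proxy=None)
wrapper = HTML5Wrapper()

fr = cms.fetcher.fetch("http://example.org/")
if fr is not None and fr.webpage is not None:
    dom = wrapper.pq(fr.webpage)
    found = cms.extract(dom, fr.webpage, fr.webpage,
                        "http://example.org/", firstpage=True)
    print(found["cms"])  # e.g. [{"type": "wordpress", "url": "http://example.org/wp-login.php"}]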
Example 14
class LinkedinFetcher:
    def __init__(self, proxy=None):
        self.fetcher = Fetcher(proxy=proxy)
        self.normalizer = normalizer.Normalizer()
        self.wrapper = HTML5Wrapper()
        self._rdomain = re.compile("^[a-z]{2,3}\\.linkedin\\.com$")
        self._rpath1 = re.compile(
            "^\\/pub\\/[^\\/]+(\\/[0-9abcdef]{1,3}){3}(\\/[a-zA-Z]+)?$")
        self._rpath2 = re.compile("^\\/in\\/[^\\/]+")
        self._rtitle = re.compile("^(.+) - ([^\\|]+) \\| LinkedIn$")

    def validate_url(self, domain, path):
        """
    Validates whether a URL is a LinkedIn profile or not.
    param: domain: The URL domain
    param: path: The URL path
    return: true/false
    """
        # Valid domain and profile path
        return self._rdomain.match(domain) is not None and (
            self._rpath1.match(path) is not None
            or self._rpath2.match(path) is not None)

    def validate_contact(self, title, firstname, lastname):
        """
    Validates if the profile page corresponds to the specified contact.
    param: title: The page title
    param: firstname: The contact first name
    param: lastname: The contact last name
    return: True if the page corresponds to the specified contact, False otherwise
    """
        # Extract name from title
        m = self._rtitle.search(title)
        # Matching title
        if m is not None:
            return self.normalize_name(
                m.group(1)) == self.normalize_name(firstname + lastname)
            # Invalid
        return False

    def normalize_name(self, name):
        """
    Normalize a name for comparison
    param: name: The name to normalize
    return: The normalized name for comparison (firstname + lastname, lowercase ASCII, without separators)
    """
        text = re.sub('[\-0-9\s]+', '', name)
        text = self.normalizer.normalize_text(text)
        return text

    def parse(self, fr):
        html = self.wrapper.pq(fr.webpage)
        lr = LinkedinResult(html, url=fr.fetched_url, wrapper=self.wrapper)
        return lr

    def extract_profile(self, url):
        """
    Fetches profile URL and cleans html.
    """
        fr = self.fetcher.fetch(url, debug=False)
        if fr is None or fr.webpage is None or fr.http_status >= 400:
            return None
        lr = self.parse(fr)
        return lr
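
A hedged sketch of how the URL validation and profile extraction above can be chained; the profile URL is purely illustrative:

from urllib.parse import urlparse  # standard library

# Hypothetical usage of LinkedinFetcher
linkedin = LinkedinFetcher(proxy=None)
url = "https://fr.linkedin.com/in/some-profile"
parts = urlparse(url)
if linkedin.validate_url(parts.netloc, parts.path):
    profile = linkedin.extract_profile(url)  # a LinkedinResult, or None on fetch failure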
Example 15
 def __init__(self, token):
     self.fetcher = Fetcher()
     self.logger = logging.getLogger("fbapi")
     self.token = "%s|%s" % (token["app_id"], token["secret_id"])
     self.logger.setLevel(logging.INFO)
Example 16
class FBAPI:
    """
    Interrogates Facebook API to get various pieces of information
    from pages.
    """
    def __init__(self, token):
        self.fetcher = Fetcher()
        self.logger = logging.getLogger("fbapi")
        self.token = "%s|%s" % (token["app_id"], token["secret_id"])
        self.logger.setLevel(logging.INFO)

    def get_graph(self, fburl):
        """
        Gets the graph API json from a facebook url

        param: fburl: fb page as an URL
        :returns: a string, or None if nothing found.
        """
        # Building graph URL from company page url
        # https://graph.facebook.com/datapublica
        account = get_facebook_account(fburl)
        if account is None:
            return None

        # See bug https://data-publica.atlassian.net/browse/RAD-265
        if b"\x85" in account.encode():
            return None

        url = FB_API_URL + account + "?access_token=" + self.token

        data = self.fetcher.fetch(url)
        jdata = json.loads(data.webpage)
        if "error" in jdata.keys():
            if jdata["error"]["code"] == 4:
                self.logger.warn("Rate limit exceeded")
                raise RateLimitExceeded()

            if jdata["error"]["code"] == 803:
                self.logger.warn(
                    "Couldn't find company FB page for URL %s, built into %s" %
                    (fburl, url))
                return None

            elif jdata["error"]["code"] == 100:
                self.logger.warn(
                    "Couldn't access FB page for URL %s, badly built into %s" %
                    (fburl, url))
                return None

            elif jdata["error"]["code"] == 104:
                self.logger.warn(
                    "Auhentification request for FB page with URL %s, built into %s"
                    % (fburl, url))
                return None

            elif jdata["error"]["code"] == 2500:
                self.logger.warn(
                    "Unkown path to FB page with URL %s, built into %s" %
                    (fburl, url))
                return None

            elif jdata["error"]["code"] == 12:
                self.logger.warn(
                    "Call to deprecated FB point with URL %s, built into %s" %
                    (fburl, url))
                return None

            elif jdata["error"]["code"] == 21:
                m = re.search("to page ID (\d+).", jdata["error"]["message"])
                self.logger.info("FB page with URL %s, was migrated into %s" %
                                 (fburl, m.group(1)))
                return self.get_graph("https://graph.facebook.com/" +
                                      m.group(1))

            else:
                raise Exception(
                    "Unknown error %d : %s" %
                    (jdata["error"]["code"], jdata["error"]["message"]))
        return jdata

    def get_company(self, fburl):
        """
        Gets a company overview from  a company facebook page.

        param: fburl: fb page as an URL
        :returns: a string, or None if nothing found.
        """

        graph = self.get_graph(fburl)
        return self.get_company_from_data(graph)

    @staticmethod
    def get_company_from_data(fbdata):
        if fbdata is None:
            return None

        return FBCompany(fbdata)

    def get_picture(self, account):
        account = get_facebook_account(account)
        if account is None:
            return None

        url = "https://graph.facebook.com/%s/picture?redirect=false&type=large" % account
        data = self.fetcher.fetch(url)
        jdata = json.loads(data.webpage)
        if "error" in jdata:
            return None
        if "data" not in jdata:
            return None
        return jdata["data"]
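
A hedged usage sketch; the token fields mirror the constructor above, the credentials are placeholders, and RateLimitExceeded is assumed to be importable from the same module:

# Hypothetical usage of FBAPI
token = {"app_id": "APP_ID", "secret_id": "APP_SECRET"}
fb = FBAPI(token)
try:
    company = fb.get_company("https://www.facebook.com/datapublica")
    picture = fb.get_picture("https://www.facebook.com/datapublica")
except RateLimitExceeded:
    company = picture = None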
Example 17
    def __init__(self, tld_file, proxy, wrapper, check_supported=True):
        # Logging initialization
        self.logger = logging.getLogger("webmining:communication_extractor")
        self.logger.setLevel(logging.INFO)

        self.check_supported = check_supported
        # Communication: SIREN, phone, contact form, emails, RSS, RSS/week, legal mention,
        #                Mobile site, responsive site
        self.metas = [
            "localId", "phone", "email", "contact", "contactform", "legal",
            "useterms", "rss", "mobile", "responsive", "capital",
            "description", "addresses"
        ]

        self.check_supported = check_supported

        # Loading all the localized resources
        resources_dir = os.path.join(LIB_PATH, "resources/localization")

        if not os.path.exists(resources_dir):
            raise NotImplementedError("No resources")

        # Cache where country specific resources are cached
        self.localization_cache = {}

        # Cache containing the current domain’s fetched rss links
        self.rss_cache = set()

        # Cache containing information for email filtering
        self.email_filtering_data = None
        with open(os.path.join(LIB_PATH, "resources", "email_filtering.json"),
                  "r") as f:
            self.email_filtering_data = json.load(f)

        # Iterating over country specific resources
        for path in os.listdir(resources_dir):
            # We consider that all directories in the resources_dir represent a
            # country
            if os.path.isdir(os.path.join(resources_dir, path)):
                country_name = path
                country_path = os.path.join(resources_dir, path)
                country = namedtuple(
                    "country",
                    ["legals", "useterms", "identification", "generic_emails"])

                with open(os.path.join(country_path, "generic_emails.txt"),
                          "r") as f:
                    country.generic_emails = set(map(str.strip, f.readlines()))

                with open(os.path.join(country_path, "legals.txt"), "r") as f:
                    country.legals = set(map(str.strip, f.readlines()))

                with open(os.path.join(country_path, "useterms.txt"),
                          "r") as f:
                    country.useterms = set(map(str.strip, f.readlines()))

                with open(os.path.join(country_path, "identification.txt"),
                          "r") as f:
                    country.identification = set(
                        map(lambda x: re.compile(x.strip()), f.readlines()))

                self.localization_cache[country_name] = country

        self.contacter = ContactDetecter()
        self.extor = Extractor()
        self.ad = address_detecter.AddressDetecter(
            cache_results=True, check_supported=check_supported)
        self.tlds = set()
        self.tel = PhoneDetecter()
        self.fetcher = Fetcher(proxy=proxy)
        self.iosfetcher = Fetcher(
            proxy=proxy,
            user_agent=
            "Mozilla/5.0 (iPhone; CPU iPhone OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3"
        )

        self.wrapper = wrapper
        # Used to tell when to empty the address detecter’s results cache
        # i.e. when we change website
        self.previous_domain = None

        # Allow countries to specify another country code for phone detection
        self.phone_country = {'UK': 'GB'}

        # TLDs in this file come from the IANA organisation
        with open(tld_file) as f:
            for tld in f:
                self.tlds.add(tld.strip())
Example 18
class CommunicationExtractor:
    def __init__(self, tld_file, proxy, wrapper, check_supported=True):
        # Logging initialization
        self.logger = logging.getLogger("webmining:communication_extractor")
        self.logger.setLevel(logging.INFO)

        self.check_supported = check_supported
        # Communication: SIREN, phone, contact form, emails, RSS, RSS/week, legal mention,
        #                Mobile site, responsive site
        self.metas = [
            "localId", "phone", "email", "contact", "contactform", "legal",
            "useterms", "rss", "mobile", "responsive", "capital",
            "description", "addresses"
        ]

        self.check_supported = check_supported

        # Loading all the localized resources
        resources_dir = os.path.join(LIB_PATH, "resources/localization")

        if not os.path.exists(resources_dir):
            raise NotImplementedError("No resources")

        # Cache where country specific resources are cached
        self.localization_cache = {}

        # Cache containing the current domain’s fetched rss links
        self.rss_cache = set()

        # Cache containing information for email filtering
        self.email_filtering_data = None
        with open(os.path.join(LIB_PATH, "resources", "email_filtering.json"),
                  "r") as f:
            self.email_filtering_data = json.load(f)

        # Iterating over country specific resources
        for path in os.listdir(resources_dir):
            # We consider that all directories in the resources_dir represent a
            # country
            if os.path.isdir(os.path.join(resources_dir, path)):
                country_name = path
                country_path = os.path.join(resources_dir, path)
                country = namedtuple(
                    "country",
                    ["legals", "useterms", "identification", "generic_emails"])

                with open(os.path.join(country_path, "generic_emails.txt"),
                          "r") as f:
                    country.generic_emails = set(map(str.strip, f.readlines()))

                with open(os.path.join(country_path, "legals.txt"), "r") as f:
                    country.legals = set(map(str.strip, f.readlines()))

                with open(os.path.join(country_path, "useterms.txt"),
                          "r") as f:
                    country.useterms = set(map(str.strip, f.readlines()))

                with open(os.path.join(country_path, "identification.txt"),
                          "r") as f:
                    country.identification = set(
                        map(lambda x: re.compile(x.strip()), f.readlines()))

                self.localization_cache[country_name] = country

        self.contacter = ContactDetecter()
        self.extor = Extractor()
        self.ad = address_detecter.AddressDetecter(
            cache_results=True, check_supported=check_supported)
        self.tlds = set()
        self.tel = PhoneDetecter()
        self.fetcher = Fetcher(proxy=proxy)
        self.iosfetcher = Fetcher(
            proxy=proxy,
            user_agent=
            "Mozilla/5.0 (iPhone; CPU iPhone OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3"
        )

        self.wrapper = wrapper
        # Used to tell when to empty the address detecter’s results cache
        # i.e. when we change website
        self.previous_domain = None

        # Allow countries to specify another country code for phone detection
        self.phone_country = {'UK': 'GB'}

        # TLDs in this file come from the IANA organisation
        with open(tld_file) as f:
            for tld in f:
                self.tlds.add(tld.strip())

    def get_country(self, country):
        if country not in self.localization_cache:
            if self.check_supported:
                raise NotImplementedError("No resource for country %s" %
                                          country)
            else:
                self.logger.warn("Unsupported country %s" % country)
                return None
        return self.localization_cache[country]

    def extract(self,
                dom,
                raw_txt,
                relevant_txt,
                url,
                firstpage,
                country="FR",
                lang="FR"):
        results = {}

        domain = urlparse(url).hostname
        if self.previous_domain is None or domain != self.previous_domain:
            self.previous_domain = domain
            self.ad.empty_cache()

        if firstpage:
            self.rss_cache = set()

        results["localId"] = self.extract_id(raw_txt, country=country)
        results["phone"], results["fax"] = self.extract_phone(raw_txt,
                                                              country=country)
        results["email"] = self.extract_email(dom,
                                              raw_txt,
                                              domain,
                                              country=country)
        results["contact"] = self.extract_contacts(raw_txt)
        results["legal"] = self.extract_legal(raw_txt)
        results["useterms"] = self.extract_useterms(raw_txt)
        results["rss"] = self.extract_rss(dom, url)
        results["responsive"] = self.extract_responsive(dom)
        results["description"] = self.extract_description(dom)
        results["capital"] = self.extor.extract_capital(raw_txt)
        results["addresses"] = self.ad.detect_addresses(raw_txt,
                                                        html=False,
                                                        country=country)

        # This extraction does an extra fetch, so we only do it for the first page
        if firstpage:
            results["mobile"] = self.extract_mobile(url)

        if self.extract_contactform(dom):
            results["contactform"] = url
        else:
            results["contactform"] = None

        return results

    def extract_mobile(self, url):
        """
        http://www.cabinetnardi.com/mobile/
        http://le-choix-funeraire.mobi/
        http://iphone.revision-et-finance-cogefor.fr
        http://m.agencecomtesse.com
        """
        up = urlparse(url)
        domain = up.scheme + "://" + up.netloc
        fr = self.iosfetcher.fetch(domain)
        if fr is not None and fr.fetched_url != domain:
            if "mobile" in fr.fetched_url or \
                    ".mobi" in fr.fetched_url or \
                    "iphone" in fr.fetched_url or \
                    "//m." in fr.fetched_url:
                return True

        return None

    def extract_responsive(self, dom):
        return len(dom("meta[name='viewport']")) > 0

    def extract_description(self, dom):
        """
        Extracts content from meta description in headers

        param: dom: the dom where to apply extraction
        """
        description = None
        desc = dom("meta[name='description']")

        # TODO: manage og
        # desc = dom("meta[name='og:description']")

        if desc.length > 0:
            description = ""
            for d in desc.items():
                if d is not None and d.attr is not None and \
                   d.attr.content is not None:
                    description += d.attr.content + ' '

        if description is None or not self._validate_description(description):
            return None

        # Remove HTML tags if present, but keep newline tags as newlines
        for tag in HTML5Wrapper.newline_tags:
            regex = "</?%s.*?>" % tag
            description = re.sub(regex, "\n", description, flags=re.I | re.M)

        # Remove remaining tags
        description = re.sub("<.+?>", " ", description, flags=re.M)
        # Remove supernumerary newlines and spaces
        description = re.sub(r"\n{2,}", "\n", description)
        description = re.sub(" {2,}", " ", description)

        return description.strip()

    def _validate_description(self, desc):
        """
        Determines if an extracted description seems to be a quality one.
        """
        badstart = ("site", "bienvenue", "joomla", "wordpress")
        badend = ("...")
        normed = desc.lower().strip()

        if normed.startswith(badstart):
            return False

        if normed.endswith(badend):
            return False

        wf = WFHistogram(normed)
        if len(wf.freq) < 5:
            return False

        return True

    def _find_rss(self, dom, url):
        domain = urlparse(url).netloc
        rsslink = None
        # First looking into head links
        # supports "rss+xml"
        for link in dom("head link[type*='application/rss'][href]").items():
            rsslink = urljoin(url, link.attr.href)
            break

        if rsslink is None:
            for node in dom("a[href]").items():
                href = node.attr.href
                # If this link could be a rss one
                if "rss" in href.lower():
                    rsslink = ""
                    if href.startswith("http"):
                        if domain in url:
                            rsslink = href
                        else:
                            continue
                    # Build absolute link from relative link
                    else:
                        rsslink = urljoin(url, href)

                    break
        # replace feed:// with http://
        if rsslink is not None and rsslink.startswith("feed:"):
            rsslink = rsslink[5:]
            if rsslink.startswith("//"):
                rsslink = "http:" + rsslink
                # supports feed:https:// as well!

        # If the rss feed is unknown, we return it
        if rsslink not in self.rss_cache:
            self.rss_cache.add(rsslink)
            return rsslink
        else:
            return None

    def extract_rss(self, dom, url):
        rsslink = self._find_rss(dom, url)
        # no rss found
        if rsslink is None:
            return (None, None)

        # Once a potential RSS link has been found, let's check it out
        return self._compute_rss_stats(rsslink,
                                       self.fetcher.fetch(rsslink, debug=True))

    def _compute_rss_stats(self, rsslink, fr):
        # Guard against failed fetches and non-XML responses
        if fr is None or not (
                "application/xml" in fr.headers["content-type"]
                or "text/xml" in fr.headers["content-type"]
                or "application/rss+xml" in fr.headers["content-type"]):
            return (None, None)

        try:
            rss = self.wrapper.pq(fr.webpage)
        except (lxml.etree.XMLSyntaxError, lxml.etree.ParserError):
            return (rsslink, 0)

        # Now let's get more recent and oldest item dates in stream
        first = last = None
        count = 0
        for entry in rss("item").items():
            count += 1
            date = feedparser._parse_date(entry("pubDate").text())
            if date is not None:
                publication = time.mktime(date)
                if first is None or first < publication:
                    first = publication
                if last is None or last > publication:
                    last = publication

        # Compute ratio items per week
        if first is not None and last is not None:
            timedelta = first - last
            if timedelta > 0:
                weekratio = count / (timedelta / (7 * 24 * 60 * 60))

                return (rsslink, weekratio)

        return (rsslink, 0)

    def extract_legal(self, raw_txt, country="FR"):
        country = self.get_country(country)
        if country is None:
            return None
        low = raw_txt.lower()
        for i in country.legals:
            if i in low:
                return True
        return None

    def extract_useterms(self, raw_txt, country="FR"):
        country = self.get_country(country)
        if country is None:
            return None
        low = raw_txt.lower()
        for i in country.useterms:
            if i in low:
                return True
        return None

    def extract_contactform(self, dom):
        """
        Searches a contact form in page by looking input names in forms.
        """
        """
    Searches a contact form in page. Uses a linear classifier.
    """
        c = ContactFormExtractor(dom)

        if c.predict():
            return True
        else:
            return None

    def extract_id(self, txt, country="FR"):
        """
        Tries to extract ID (siren, siret, TVA, KBO, etc…) from page text
        """

        re_country = self.get_country(country)
        if re_country is None:
            return None

        lower_txt = txt.lower()
        for regex in re_country.identification:
            m = re.search(regex, lower_txt)
            if m is not None:
                ide = re.sub('[^\d]', '', m.group(1))

                # Checking extraction quality
                if country == "BE":
                    if len(ide) < 10:
                        ide = "0" + ide

                    if len(ide) != 10:
                        return None

                elif country == "FR":
                    if len(ide) != 9:
                        return None

                elif country == 'UK':
                    if len(ide) == 7:
                        ide = "0" + ide

                return ide

        return None

    def extract_contacts(self, raw_txt):
        return self.contacter.detect(raw_txt)

    def extract_phone(self, raw_txt, country="FR"):
        """
        Returns a tuple containing :
            - a list of detected phones
            - a list of detected faxes
        """

        phone_country_ = country
        if country in self.phone_country:
            phone_country_ = self.phone_country[country]
        results = self.tel.detect(raw_txt, country=phone_country_)
        phones = [r[1] for r in results if r[0] == "phone"]
        faxes = [r[1] for r in results if r[0] == "fax"]

        return (phones, faxes)

    def _validate_email(self, email, domain, country="FR"):
        """
        Checks out that the email is valid and usable.
        Sorts emails between generic ones and direct contacts.
        param: email: a str believed to be an email
        param: domain: the domain of the analyzed website; used to determine if an email address
                       is really related to the website
        return: a tuple (email, is_contact) where is_contact in [True, False]
                False is for generic contact emails such as [email protected]
        """
        if self.check_supported:
            country = self.get_country(country)

        email = email.strip().lower()

        # We accept at most 3 sub-domains in the email address
        m = re.search("([\w\.\-]+@[\w\-]+(\.[\w\-]+){1,3})", email)

        if m is not None:
            # email is validated, but let's check it's not a generic email
            email = m.group(1)
            prefix, suffix = email.split('@')

            # Bad suffix (domain.com, example.com...)
            if suffix in self.email_filtering_data["domains_blacklist"]:
                return None

            # Bad tld in extracted email
            if suffix.split(".")[-1] not in self.tlds:
                self.logger.info(">>> TLD refused : %s" % email)
                return None

            # Email prefix in blacklist (CNIL...)
            if prefix in self.email_filtering_data["prefixes_blacklist"]:
                self.logger.info(">>> Blacklisted email prefix found: %s" %
                                 email)
                return None

            # Fuzzy match between the suffix and the domain
            fuzzy_match = fuzz.token_sort_ratio(suffix, domain)
            # This value should be tested against a real database of examples
            fuzzy_threshold = 70
            if fuzzy_match < fuzzy_threshold:
                # Test email providers domains: if we find an email @wanadoo.fr,
                # we can't be sure it's not a real one
                if not any([
                        fuzz.token_sort_ratio(suffix, d) >= fuzzy_threshold
                        for d in self.email_filtering_data["email_providers"]
                ]):
                    return None

            self.logger.info("> found [" + email + "]")

            for pattern in country.generic_emails:
                if re.match(pattern, prefix) is not None:
                    return (email, False)
            return (email, True)

        else:
            self.logger.warning("WARNING>> unvalidated email : " + email)
            return None

    def extract_email(self, dom, raw_txt, domain, country="FR"):
        """
        Tries to extract an email address from a mailto structure.
        If nothing found, tries a detection from raw text.
        """
        for node in dom("a[href^='mailto:']").items():
            # <a href="mailto:[email protected]">Clique ici pour m'envoyer un e-mail</a>
            mail = node.attr.href[7:]
            clear = mail.lower().split('?')
            if len(clear) > 0:
                return self._validate_email(clear[0], domain, country)
            else:
                continue

        # If no mailto found, let's try to extract an email from raw text
        # Not a findall for performance reasons
        m = re.search("[\s:]([\w\.\-]+@[\w\.\-]+)[\s\"<]", raw_txt + " ")
        if m is not None:
            return self._validate_email(m.group(1), domain, country)

        return None
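
A hedged sketch of calling the extractor above; the tld_file path and the URL are placeholders, and passing the raw HTML for both text arguments is a simplification:

# Hypothetical driver for CommunicationExtractor
wrapper = HTML5Wrapper()
comm = CommunicationExtractor(tld_file="tlds.txt", proxy=None,
                              wrapper=wrapper, check_supported=False)

fr = comm.fetcher.fetch("http://example.fr/")
if fr is not None and fr.webpage is not None:
    dom = wrapper.pq(fr.webpage)
    metas = comm.extract(dom, fr.webpage, fr.webpage,
                         "http://example.fr/", firstpage=True, country="FR")
    print(metas["email"], metas["phone"], metas["rss"])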
Example 19
class ViadeoFetcher:
    LANG = "(com|de|en|es|fr|gb|it)"

    def __init__(self, proxy=None):
        self.fetcher = Fetcher(proxy=proxy)
        self.wrapper = HTML5Wrapper()
        self.normalizer = normalizer.Normalizer()
        self.valid_domains = re.compile("^.*.viadeo." + self.LANG + "$")
        self._rpath1 = re.compile("^\\/" + self.LANG +
                                  "\\/profile\\/([^\\/]+).*$")
        self._rpath2 = re.compile("^\\/r\\/profile\\/([^\\/]+)\\/" +
                                  self.LANG + "\\/public(\\/.*)?$")
        self._rtitle = re.compile("^([^,]+).*$")

    def validate_url(self, domain, path):
        """
    Validates whether a URL is a Viadeo profile or not.
    param: domain: The URL domain
    param: path: The URL path
    return: true/false
    """
        # Valid domain and profile path
        return self.valid_domains.match(domain) is not None and \
               (self._rpath1.match(path) is not None or self._rpath2.match(path) is not None)

    def validate_contact(self, title, firstname, lastname):
        """
    Validates if the profile page corresponds to the specified contact.
    param: title: The page title
    param: firstname: The contact first name
    param: lastname: The contact last name
    return: True if the page corresponds to the specified contact, False otherwise
    """
        # Extract name from title
        title = title.replace("<b>", "").replace("</b>", "")
        m = self._rtitle.search(title)
        # Matching title
        if m is not None:
            return self.normalize_name(
                m.group(1)) == self.normalize_name(firstname + lastname)
            # Invalid
        return False

    def normalize_name(self, name):
        """
    Normalize a name for comparison
    param: name: The name to normalize
    return: The normalized name for comparison (firstname + lastname, lowercase ASCII, without separators)
    """
        text = re.sub('[\-0-9\s]+', '', name)
        text = self.normalizer.normalize_text(text)
        return text

    def parse(self, fr):
        html = self.wrapper.pq(fr.webpage)
        lr = ViadeoResult(html, url=fr.fetched_url, wrapper=self.wrapper)
        return lr

    def extract_profile(self, url):
        """
    Fetches profile URL and cleans html.
    """
        # Extract profile
        fr = self.fetcher.fetch(url, debug=False)
        if fr is None or fr.webpage is None:
            return None

        return self.parse(fr)
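
A hedged sketch of the contact-validation step above; the name, page title, and profile URL are invented for illustration:

# Hypothetical usage of ViadeoFetcher
viadeo = ViadeoFetcher(proxy=None)
title = "Jean Dupont, Directeur Commercial"  # typical "<name>, <headline>" page title
if viadeo.validate_contact(title, "Jean", "Dupont"):
    result = viadeo.extract_profile("http://www.viadeo.com/fr/profile/jean.dupont")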
Example 20
class BingAPIFetcher:
    """
    Fetches Bing results for a given query
    """
    def __init__(self, key, proxy=None):

        self.base_url = "https://api.datamarket.azure.com/Bing/SearchWeb/v1/Web?$format=json&"
        # Building authentication from key
        s = '%s:%s' % (key, key)
        credentials = base64.b64encode(s.encode('utf-8'))
        self.auth = 'Basic %s' % credentials.decode('utf-8')

        # Markets for localized and more accurate search
        self.markets = {
            "FR": "fr-FR",
            "BE": "fr-BE",
            "GB": "en-GB",
            "US": "en-US",
            "DE": "de-DE",
            "UK": "en-GB"
        }

        # Fetcher initialization
        self.fetcher = Fetcher(proxy=proxy)
        self.fetcher.headers["Authorization"] = self.auth

        # Logging initialization
        self.logger = logging.getLogger("webmining:bingapi_fetcher")
        self.logger.setLevel(logging.INFO)
        self.wrapper = HTML5Wrapper()

    def parse(self, webpage, bresults):

        # We check that our API account is not empty. This is tricky because in this case
        # the Bing API replies in plain text instead of JSON.
        if webpage.strip() == "Insufficient balance for the subscribed offer in user's account":
            raise EmptyBingAPIAccount(
                "Insufficient balance for the subscribed offer in user's account"
            )
        json_result = json.loads(webpage, encoding="utf-8")
        result_list = json_result['d']['results']

        if webpage is not None:
            for r in result_list:
                br = BingAPIResult(r)
                bresults.append(br)

        return webpage

    def fetch(self, q, start=0, country="FR"):
        """
        Fetches Bing with the query q and sends back
        a list of results.
        param: q: a query, as a string
        param: start: the starting offset (first 50 results are start=0, next 50 start=1, ...)
        param: country of the searched company
        return: a list of BingAPIResult
        """
        bresults = []
        # Strip single quotes, which would otherwise break the Bing query syntax
        q = q.replace("'", "")
        query = "'%s" % q
        query += "'"
        query = urllib.parse.urlencode({
            "Query": query,
            "$top": "50",
            '$skip': "%i" % (start * 50),
            'Market': "'%s'" % self.markets[country],
            'Options': "'DisableLocationDetection'",
        })
        url = self.base_url + query
        fr = self.fetcher.fetch(url, debug=False, force_encoding="utf-8")

        self.logger.debug("Fetched url [%s] start=%s" % (url, start))
        if fr is None or fr.webpage is None:
            self.logger.warn("Got nothing from [%s]" % url)
            return bresults

        self.logger.debug("Returned result - " + str(fr.fetched_url))
        self.parse(fr.webpage, bresults)

        self.logger.info("Fetched [%s] with %d results" % (url, len(bresults)))

        return bresults
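
A hedged usage sketch; the Azure Marketplace key is a placeholder, and the start parameter pages through results 50 at a time as described in the docstring:

# Hypothetical usage of BingAPIFetcher
api = BingAPIFetcher(key="AZURE_MARKETPLACE_KEY")
first_page = api.fetch("data publica", start=0, country="FR")   # results 1-50
second_page = api.fetch("data publica", start=1, country="FR")  # results 51-100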
Example 21
 def __init__(self, tld="fr", proxy=None):
     self.fetcher = Fetcher(proxy=proxy)
     self.wrapper = HTML5Wrapper()
     self.base_url = "http://www.google.%s/search?rls=en&ie=UTF-8&oe=UTF-8&" % tld
Example 22
class Crawler:
    """
    A generic crawler.
    """
    def __init__(self,
                 filename=None,
                 seedlist=None,
                 debug=False,
                 proxy=None,
                 multiproc=True,
                 mode=CrawlMode.entire,
                 max_page_size=PAGE_SIZE_LIMIT):
        """
        :param filename: path to the seed file
        :param mode: crawling mode, either "entire", "single", "subpath"
        """
        self.seed = None
        self.debug = debug
        # init the fetcher with a download limit size
        self.fetcher = Fetcher(proxy, max_page_size=max_page_size)
        self.htmltools = HTML5Wrapper()
        self.crawl_depth = 0  # Do we crawl domains outside the seed
        self.domain_depth = 0  # At which depth each seed element must be crawled
        self.page_limit = 0  # Max amount of pages to be crawled
        self.max_page_size = max_page_size
        self.website = Website()
        self.me = MetaExtractor(proxy=proxy)

        self.badextensions = set(["pdf", "xls", "doc", "ppt", "rtf", "odt", "zip", "tar.gz", "tar", "exe", \
                                  "jpg", "png", "jpeg", "bmp", "gif", "mp3", "flv", "rar", "ogv", "avi", "mp4", \
                                  "mkg", "ps", "ogg", "webm", "ogm", "pps", "pptx", "docx", "xlsx", "mpg", "mov", \
                                  "mkv", "mpeg", "m4v", "iso"])

        self.crawling_process_over = False

        # Logging initialization
        self.logger = logging.getLogger("webmining:crawler")
        self.logger.setLevel(logging.INFO)
        if debug:
            self.logger.setLevel(logging.DEBUG)
        self.filename = filename
        self.seedlist = seedlist
        self.mode = mode
        self.authorized_domains = set()

    def _monitore_processes(self, processes):
        """
        Checks if subcrawling processes are over.
        This method is meant to be used wrapped into a Thread.
        """
        for p in processes:
            p["event"].wait()

        self.crawling_process_over = True

    def spawn_crawl_processes(self, html2txt, metas, proc, wait_courtesy):
        processes = []
        for i in range(0, proc):
            e = Event()
            p = Process(None, self._sub_crawl, None, (), {"queue": self.seed.q, "storage": self.storage, "end_event": e, \
                                                          "wait": wait_courtesy, "html2txt": html2txt, "metas": metas})
            p.start()

            processes.append({"proc": p, "event": e, "id": i})
        monitor = Thread(group=None,
                         target=self._monitore_processes,
                         name=None,
                         args=(),
                         kwargs={"processes": processes})
        monitor.start()
        while not self.crawling_process_over:
            # If all processes are over, or if getting an element
            # from queue takes more than timeout seconds (which seems empirically abnormal)
            # then crawl is finished.
            c = 0
            for p in processes:
                if not p["proc"].is_alive():
                    c += 1

            if c >= len(processes):
                self.logger.warning("All processes are dead !")
                break

            try:
                el = self.storage.get(block=True, timeout=5)
                yield el
            except Empty:
                if self.storage.empty():
                    pass
        self.logger.debug("joining processes...")
        for p in processes:
            if p["proc"].is_alive():
                p["proc"].terminate()

            p["proc"].join()

        # Finally, joining monitoring thread
        monitor.join(3)
        if monitor.is_alive():
            monitor._stop()

    def crawl(self,
              proc=None,
              domain_depth=0,
              crawl_depth=0,
              page_limit=None,
              wait_courtesy=0,
              html2txt=False,
              metas=None):
        """
        :param proc:           amount of processes to spawn, 0 or None can be used to exploit the current process
        :param domain_depth:   crawling depth for each seed element (inside original domain)
        :param crawl_depth:    crawling depth for each seed element (outside original domain)
        :param page_limit:     max amount of page to crawl
        :param wait_courtesy:  time in second between each fetch
        :param html2txt:       resulting pages are raw html (default), or cleaned text
        :param metas:          metas we want to extract during crawling
        """

        self.domain_depth = domain_depth
        self.crawl_depth = crawl_depth
        self.page_limit = page_limit
        # lazy loading, to know if we need to implement seeds with multiproc or not
        if self.seed is None:
            if self.filename is not None:
                self.seed = Seed(f=self.filename,
                                 multiproc=not (proc is None or proc == 0))
            elif self.seedlist is not None:
                self.seed = Seed(s=self.seedlist,
                                 multiproc=not (proc is None or proc == 0))

        if proc is None or proc == 0:
            self.storage = Queue()  # Will contain shared crawl results
            self._sub_crawl(self.seed.q, self.storage, Event(), wait_courtesy,
                            html2txt, metas, None)
            while True:
                try:
                    el = self.storage.get(block=False)
                    yield el
                except Empty:
                    break
        else:
            self.storage = MPQueue()  # Will contain shared crawl results
            yield from self.spawn_crawl_processes(html2txt, metas, proc,
                                                  wait_courtesy)

    def _sub_crawl(self,
                   queue,
                   storage,
                   end_event,
                   wait,
                   html2txt,
                   metas,
                   block_timeout=5):
        """
        This private method will be wrapped into a process,
        and is in charge of dequeuing seed elements, and recording results into
        the storage.
        """
        while True:
            se = None
            pages = []
            try:
                se = queue.get(block=block_timeout is not None,
                               timeout=block_timeout)
            except Empty:
                end_event.set()
                return

            self.logger.info("Launched crawl [%s]" % se.url)
            start_url = se.url  # Need to keep it as it may change due to redirect
            pages = self.crawl_domain(se, self.domain_depth, wait, html2txt,
                                      self.page_limit, self.mode)
            self.logger.info("Crawl over with %d pages [%s]" %
                             (len(pages),
                              (se.url if start_url in se.url else '%s -> %s' %
                               (start_url, se.url))))

            first = True
            for url in pages:
                se = pages[url]
                ext_metas = {}

                # Extract asked metas from page
                if metas is not None:
                    try:
                        ext_metas = self.me.extract(metas, se.html, se.relevant_txt,
                                                    url=url, firstpage=first)
                        first = False
                    except MetaExtractionException as e:
                        self.logger.warning(
                            "Impossible to extract metas in [%s]: " % url)
                        self.logger.warning(e)
                        continue

                    for m in ext_metas:
                        if ext_metas[m] is not None:
                            if m not in se.metas.keys():
                                if m in ["contact", "phone", "fax"]:
                                    se.metas[m] = []
                                else:
                                    se.metas[m] = set()

                            if m in ["contact", "phone", "fax"]:
                                se.metas[m].extend(ext_metas[m])
                            else:
                                se.metas[m].add(ext_metas[m])

                storage.put(se)

            # Let's save memory
            del pages

            if self.crawl_depth > 0:
                # TODO: create new seed elements to put in queue when crawl deeper than 0
                # with an updated depth, domain, etc...
                raise Exception("Not implemented")

    def _check_first_page(self, dom, url):
        """
        Checks if domain first page is
          - a html redirection
          - a frameset

        returns an url to follow, or None if nothing detected.
        """
        # we check out if it contains a <meta http-equiv="refresh"
        # ex. <meta http-equiv="Refresh" content="0; URL=corporate-finance/corporate-finance-presentation.html">
        metas = dom(
            "meta[http-equiv='refresh'][content], meta[http-equiv='Refresh'][content], meta[http-equiv='REFRESH'][content]"
        )
        #raise Exception("type of metas : " + str(type(metas)) + "\n" + str(dir(metas)))
        base_url = self._get_base_url(dom, url)

        for m in metas.items():
            content = m.attr.content

            m = re.search("url\s?=\s?(.*?)\s", content + ' ', flags=re.I)
            if m is not None:
                rurl = m.group(1).strip()
                rurl = urllib.parse.urljoin(base_url, rurl)

                self.logger.info("HTTP redirection to [%s]" % rurl)
                return rurl

        # We check out if it contains a <frame src="..."
        # and only return first found url if true
        # TODO: is it relevant to return only the first frame?
        frames = dom("frame[src]")
        for f in frames.items():
            rurl = urllib.parse.urljoin(base_url, f.attr.src)
            self.logger.info("FRAME redirection to [%s]" % rurl)
            return rurl

        # We check out if it contains a JS redirection document.location.href=
        # and only return first found url if true
        scripts = dom("script")
        for s in scripts.items():
            js = s.text()
            if js is not None:
                m = re.search(
                    r"document\.location\.href\s?=\s?[\"']([^\"]*?)[\"']\s*[^+]",
                    js + " ",
                    flags=re.I)

                if m is not None:
                    rurl = urllib.parse.urljoin(base_url, m.group(1).strip())
                    self.logger.info("JavaScript redirection to [%s]" % rurl)
                    return rurl

        return None

    def _verify_and_parse_result(self, fresult, seed_el):
        """
        Verify if a fetch result is valid for parsing. If so, it will build the pq element that correspond to the webpage

        :param fresult: FetchResult object
        :param seed_el: SeedElement object
        :return: The pq element that correspond
        """
        if fresult is None:
            return None
        html = fresult.webpage
        content_type = fresult.content_type

        # in case of 300/302 we use final url given by fetcher
        seed_el.url = fresult.fetched_url

        if fresult.http_status is None or fresult.http_status != 200:
            self.logger.warning("Bad HTTP Status (%s) for [%s]" %
                                (str(fresult.http_status), seed_el.url))
            return None

        if html is None:
            self.logger.warning("Impossible to crawl [%s]" % seed_el.url)
            # The miss is logged rather than silently ignored, as this kind of website can be dangerous
            return None

        # We only want to process text/html webpages
        if content_type is not None and "text/html" not in content_type.lower():
            self.logger.info("Content type ignored: %s [%s]" %
                             (str(content_type), seed_el.url))
            return None

        # Too large file
        self.logger.debug("Page size of %d characters" % len(html))
        if len(html) > self.max_page_size:
            self.logger.warning("Page ignored, too big (%d characters) in %s" %
                                (len(html), seed_el.url))
            return None

        # This is an attachment, so we must ignore it
        if fresult.attachment is not None:
            self.logger.warning(
                "Page ignored because it corresponds to the attachment %s [%s]"
                % (fresult.attachment, seed_el.url))
            return None

        if len(html) == 0:
            self.logger.warning("Page ignored because it is empty [%s]" %
                                seed_el.url)
            return None

        try:
            dom = self.htmltools.pq(html)
        except Exception as e:
            self.logger.warning("Impossible to parse html url=%s : %s" %
                                (fresult.fetched_url, str(e)))
            return None
        # DEACTIVATED FEATURE
        # Test to see if the root node is a html node
        # if dom[0].tag.lower() != 'html':
        # self.logger.warning("Page is not a valid html [%s]" % seed_el.url)
        # return None
        return dom

    @staticmethod
    def _generate_authorized_domains(domain):
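        # Worked example (hypothetical input): "www.example.fr" yields
        # {"www.example.fr", "example.fr", "www.example.com", "example.com"}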
        domain = domain.lower()  # Force lower case
        auth = set([domain])
        if "www." in domain:
            auth.add(domain.replace("www.", ""))
        else:
            auth.add("www." + domain)

        comdom = {
            dom.rsplit(".", maxsplit=1)[0] + ".com"
            for dom in auth if ".com" not in dom
        }
        auth.update(comdom)

        return auth

    def _is_authorized_subpath(self, init_url, target_url):
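        # Example (hypothetical URLs): with init_url "http://www.example.fr/blog/"
        # and target_url "http://www.example.fr/blog/post-1", this returns True,
        # provided "www.example.fr" is in self.authorized_domains.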
        # Force lower case
        init_url = init_url.lower() if init_url is not None else init_url
        target_url = target_url.lower() if target_url is not None else target_url

        init_path = urllib.parse.urlparse(init_url).path
        target_url_parsed = urllib.parse.urlparse(target_url)
        target_domain, target_path = target_url_parsed.netloc, target_url_parsed.path

        return (target_domain in self.authorized_domains
                and target_path.startswith(init_path))

    def crawl_domain(self,
                     init_seed_el,
                     max_dom_depth,
                     wait,
                     html2txt,
                     limit=None,
                     mode=CrawlMode.entire):
        """
        Fetches a domain, and then crawls its internal pages until given depth.
        Returns a dictionary of url -> html code.
        """
        pages = {}
        visited = set()  # Already visited URLs
        # List of found links as SeedElements, waiting to be fetched
        found_links = [init_seed_el]
        # Override the limit to crawl only one page
        if mode == CrawlMode.single:
            limit = 1
            max_dom_depth = 1
        self.logger.info("Launching crawl in %s mode" % mode.value)
        # -- Managing authorized domains for this crawl --
        domain = urllib.parse.urlparse(init_seed_el.url).netloc
        self.authorized_domains = self._generate_authorized_domains(domain)
        self.logger.info("Authorized domains for this crawl : %s" %
                         str(self.authorized_domains))

        # Looping through found urls
        while True:
            if limit is not None and len(visited) > limit:
                self.logger.info("Max amount of pages reached ! (%d)" % limit)
                return pages

            self.logger.debug("%d url visited so far" % len(visited))

            seed_el = None  # Current element being computed, in while loop
            try:
                while True:
                    seed_el = found_links.pop(0)
                    if seed_el.url not in visited:
                        break
                visited.add(seed_el.url)  # A popped element is considered visited
            except IndexError:
                self.logger.info("No more links to visit for this website.")
                return pages

            # Fetching URL given in seed element in param
            self.logger.debug("Fetching " + seed_el.url)

            fresult = None
            retry = 0
            max_retry = 2  # TODO - VYS - Make this configurable
            while fresult is None and retry <= max_retry:
                try:
                    fresult = self.fetcher.fetch(seed_el.url,
                                                 self.debug,
                                                 timeout=10)
                    # If we are here, no more retries are needed; disable retrying
                    retry = max_retry + 1
                except Timeout:
                    self.logger.warning(
                        "Timeout while fetching %s%s" %
                        (seed_el.url, (", lets retry (max retry %s)" %
                                       max_retry) if retry == 0 else
                         (" - retry %s/%s" % (retry, max_retry))))
                    retry += 1
                    continue

            if fresult is None:
                continue

            if wait > 0:
                time.sleep(wait)

            # Let's quickly check whether we got a redirect
            rurl30X = None
            if fresult.fetched_url != seed_el.url:
                rurl30X = fresult.fetched_url
                self.logger.warning("Got a redirect to %s when fetching %s" %
                                    (fresult.fetched_url, seed_el.url))
            dom = self._verify_and_parse_result(fresult, seed_el)
            if dom is None:
                self.logger.warning("Found no DOM for %s" % seed_el.url)
                continue

            # normalize root urls to avoid a double visit at http://www.example.com/ and http://www.example.com
            path = urllib.parse.urlparse(seed_el.url).path
            if path == '':
                seed_el.url += '/'

            self.logger.debug("Fetched [%s] " % seed_el.url)

            # If this page is the first one for this domain,
            # we check whether it contains a <meta http-equiv="refresh">.
            # The same applies to the second page,
            # because sometimes a redirection is followed by a frame.
            if len(visited) < 2:
                rurl = self._check_first_page(dom, seed_el.url)
                rurl = rurl if rurl is not None else rurl30X
                if rurl is not None:
                    domain = urllib.parse.urlparse(rurl).netloc
                    domain = domain.lower()
                    # If we are following a redirect, we also add it to the set of authorized domains
                    # to be able to follow next urls.
                    self.authorized_domains.add(domain)
                    if "www." in domain:
                        self.authorized_domains.add(domain.replace("www.", ""))
                    else:
                        self.authorized_domains.add("www." + domain)

                    self.logger.info(
                        "New authorized domains for this crawl: %s" %
                        str(self.authorized_domains))

                    if seed_el.url not in visited:
                        visited.add(seed_el.url)

                        # Adding detected url to follow
                        ser = SeedElement(rurl, seed_el.groupid)
                        ser.depth = seed_el.depth + 1
                        found_links.append(ser)

            # If the new page url, after redirections, is outside authorized domains, don't use it
            if urllib.parse.urlparse(
                    seed_el.url).netloc.lower() not in self.authorized_domains:
                self.logger.warning(
                    "redirection to %s is outside the authorized domains, page not analyzed"
                    % seed_el.url)
                continue
            if mode == CrawlMode.subpath and not self._is_authorized_subpath(
                    init_seed_el.url, seed_el.url):
                self.logger.warning(
                    "subpath mode: %s is outside the authorized subpath, page not analyzed"
                    % seed_el.url)
                continue
            # ---
            # HTML computing
            # ---
            # Converting html into "clean" and interesting text
            relevant_txt = self.website.extract_meaningful_text(dom)

            # Builds a new Seed Element from popped element
            se = SeedElement(seed_el.url, seed_el.groupid)
            se.depth = seed_el.depth
            se.relevant_txt = relevant_txt
            if fresult is not None:
                se.html = fresult.webpage
                se.content_type = fresult.content_type
                se.charset = fresult.charset
                se.http_status = fresult.http_status
                se.headers = fresult.headers

            # Sometimes DOM is too deep to extract title properly
            se.title = self.website.extract_title(dom)

            pages[seed_el.url] = se
            visited.add(seed_el.url)  # May differ from the original URL because of redirections

            # This page has been computed, let's now extract its links
            # if max depth not reached
            if seed_el.depth + 1 > max_dom_depth:
                continue
            if mode != CrawlMode.single:
                found_links.extend(
                    self._extract_links(dom, init_seed_el, seed_el, visited,
                                        mode))

        self.logger.debug("Out of while loop.")
        return pages

    def _get_base_url(self, dom, url):
        # check if there is a 'base' tag for link compute
        base_url = dom('base').attr('href')
        if base_url is None:
            base_url = url
        return base_url

    def _extract_links(self, dom, init_seed_el, seed_el, visited, mode):
        """
        Given a dom, extract internal links to crawl
        """

        # ---
        # Link extraction and checking
        # ---
        links = {}
        selected_links = []
        added = set()

        # DOM is sometimes too deep to extract links properly
        try:
            links = self.htmltools.extract_doc_links(dom)
        except Exception as e:
            links = {}
            self.logger.warning("Impossible to extract links from %s : %s" %
                                (seed_el.url, str(e)))

        base_url = self._get_base_url(dom, seed_el.url)
        for key in links:
            # We do not want anchors to be crawled
            key = key.split("#")[0]
            if len(key) < 1:
                continue

            url = None
            try:
                url = urllib.parse.urljoin(base_url, key)
            except Exception as e:
                # Invalid url, ignoring
                self.logger.warning("Invalid urljoin (%s,%s): %s" %
                                    (base_url, key, str(e)))
                continue

            # Trying to get eventual file extension, and to check its validity
            path = urllib.parse.urlparse(url).path
            if path == '':
                url += '/'
            else:
                ext = path.split('.')[-1].strip().lower()
                if ext in self.badextensions:
                    self.logger.debug("Bad extension [%s] in %s" % (ext, url))
                    continue

            # Let's check if it's an internal link, and not an outgoing one
            if (urllib.parse.urlparse(url).netloc.lower() in self.authorized_domains
                    and url not in visited and url not in added):
                if mode == CrawlMode.subpath and not self._is_authorized_subpath(
                        init_seed_el.url, url):
                    continue
                se = SeedElement(url, seed_el.groupid)
                se.depth = seed_el.depth + 1
                selected_links.append(se)
                added.add(url)

        return selected_links