Example #1
    def __init__(self, url, config=None, extractor=None, **kwargs):
        if (url is None) or ('://' not in url) or (url[:4] != 'http'):
            raise ValueError('Input url is bad!')

        self.config = utils.extend_config(config, kwargs)

        self.extractor = extractor

        self.url = urls.prepare_url(url)

        self.domain = urls.get_domain(self.url)
        self.scheme = urls.get_scheme(self.url)

        self.categories = []
        self.feeds = []
        self.articles = []

        self.html = ''
        self.doc = None

        self.logo_url = ''
        self.favicon = ''
        self.brand = 'datahub'
        self.description = ''

        self.is_parsed = False
        self.is_downloaded = False
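
The class this constructor belongs to is not named in the snippet. Assuming it is a Source-style class called Source (an illustrative name, as are my_extractor and the verbose kwarg), construction might look like:

# Hypothetical usage; `Source`, `my_extractor`, and the kwarg name are
# assumptions -- only the constructor body is shown above.
source = Source('https://example.com', extractor=my_extractor,
                verbose=True)       # extra kwargs are folded into the config
print(source.domain)                # e.g. 'example.com'
print(source.scheme)                # e.g. 'https'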
Example #2
    def __init__(self, url, config=None, **kwargs):
        """The config object for this source will be passed into all of this
        source's children articles unless specified otherwise or re-set.
        """
        if (url is None) or ('://' not in url) or (url[:4] != 'http'):
            raise ValueError('Input url is bad!')

        self.config = config or Configuration()
        self.config = utils.extend_config(self.config, kwargs)

        self.extractor = IntiExtractor(self.config)

        self.url = urls.prepare_url(url)

        self.domain = urls.get_domain(self.url)
        self.scheme = urls.get_scheme(self.url)

        self.categories = []
        self.feeds = []
        self.articles = []

        self.html = ''
        self.doc = None

        self.logo_url = ''
        self.favicon = ''
        self.brand = tldextract.extract(self.url).domain
        self.description = ''

        self.is_parsed = False
        self.is_downloaded = False
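
utils.extend_config is not shown in either example. Based on how it is called, a minimal sketch of its likely behavior (an assumption inferred from the call sites, not the actual helper) is:

def extend_config(config, config_items):
    # Overwrite existing attributes on the config object with any
    # matching keyword arguments passed to the constructor (assumed
    # behavior, inferred from the call sites above).
    for key, value in config_items.items():
        if hasattr(config, key):
            setattr(config, key, value)
    return config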
Example #3
    def get_category_urls(self, source_url, doc):
        """Inputs source lxml root and source url, extracts domain and
        finds all of the top level urls, we are assuming that these are
        the category urls.
        cnn.com --> [cnn.com/latest, world.cnn.com, cnn.com/asia]
        """
        page_urls = self.get_urls(doc)
        valid_categories = []
        for p_url in page_urls:
            scheme = urls.get_scheme(p_url, allow_fragments=False)
            domain = urls.get_domain(p_url, allow_fragments=False)
            path = urls.get_path(p_url, allow_fragments=False)

            if not domain and not path:
                if self.config.verbose:
                    print('elim category url %s for no domain and path'
                          % p_url)
                continue
            if path and path.startswith('#'):
                if self.config.verbose:
                    print('elim category url %s path starts with #' % p_url)
                continue
            if scheme and (scheme != 'http' and scheme != 'https'):
                if self.config.verbose:
                    print('elim category url %s for bad scheme, '
                          'not http nor https' % p_url)
                continue

            if domain:
                child_tld = tldextract.extract(p_url)
                domain_tld = tldextract.extract(source_url)
                child_subdomain_parts = child_tld.subdomain.split('.')
                subdomain_contains = False
                for part in child_subdomain_parts:
                    if part == domain_tld.domain:
                        if self.config.verbose:
                            print('subdomain contains at %s and %s' %
                                  (str(part), str(domain_tld.domain)))
                        subdomain_contains = True
                        break

                # Ex. microsoft.com is definitely not related to
                # espn.com, but espn.go.com is probably related to espn.com
                if not subdomain_contains and \
                        (child_tld.domain != domain_tld.domain):
                    if self.config.verbose:
                        print('elim category url %s for domain '
                              'mismatch' % p_url)
                    continue
                elif child_tld.subdomain in ['m', 'i']:
                    if self.config.verbose:
                        print('elim category url %s for mobile '
                              'subdomain' % p_url)
                    continue
                else:
                    valid_categories.append(scheme + '://' + domain)
                    # TODO account for case where category is in form
                    # http://subdomain.domain.tld/category/ <-- still legal!

        stopwords = [
            'about', 'help', 'privacy', 'legal', 'feedback', 'sitemap',
            'profile', 'account', 'mobile', 'facebook', 'myspace',
            'twitter', 'linkedin', 'bebo', 'friendster', 'stumbleupon',
            'youtube', 'vimeo', 'store', 'mail', 'preferences', 'maps',
            'password', 'imgur', 'flickr', 'search', 'subscription',
            'itunes', 'siteindex', 'events', 'stop', 'jobs', 'careers',
            'newsletter', 'subscribe', 'academy', 'shopping', 'purchase',
            'site-map', 'shop', 'donate', 'product', 'advert', 'info',
            'tickets', 'coupons', 'forum', 'board', 'archive', 'browse',
            'howto', 'how to', 'faq', 'terms', 'charts', 'services',
            'contact', 'plus', 'admin', 'login', 'signup', 'register',
            'developer', 'proxy']

        _valid_categories = []

        # TODO Stop spamming urlparse and tldextract calls...

        for p_url in valid_categories:
            path = urls.get_path(p_url)
            subdomain = tldextract.extract(p_url).subdomain
            conjunction = path + ' ' + subdomain
            bad = False
            for badword in stopwords:
                if badword.lower() in conjunction.lower():
                    if self.config.verbose:
                        print('elim category url %s for containing '
                              'stopword!' % p_url)
                    bad = True
                    break
            if not bad:
                _valid_categories.append(p_url)

        _valid_categories.append('/')  # add the root

        for i, p_url in enumerate(_valid_categories):
            if p_url.startswith('://'):
                p_url = 'http' + p_url
                _valid_categories[i] = p_url

            elif p_url.startswith('//'):
                p_url = 'http:' + p_url
                _valid_categories[i] = p_url

            if p_url.endswith('/'):
                p_url = p_url[:-1]
                _valid_categories[i] = p_url

        _valid_categories = list(set(_valid_categories))

        category_urls = [urls.prepare_url(p_url, source_url)
                for p_url in _valid_categories]
        category_urls = [c for c in category_urls if c is not None]
        return category_urls
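
To make the subdomain-relatedness heuristic in the loop above concrete, here is a self-contained sketch; is_related is a hypothetical helper, not part of the example, and it mirrors the tldextract checks performed there:

import tldextract

def is_related(child_url, source_url):
    # Related if the registered domains match, or if the source's domain
    # appears as a dot-separated part of the child's subdomain.
    child = tldextract.extract(child_url)
    source = tldextract.extract(source_url)
    if child.domain == source.domain:
        return True
    return source.domain in child.subdomain.split('.')

# espn.go.com is probably related to espn.com...
assert is_related('http://espn.go.com', 'http://espn.com')
# ...but microsoft.com is definitely not
assert not is_related('http://microsoft.com', 'http://espn.com')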