def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
             tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True, process_value=None,
             check_url=True):
    """Store ``check_url`` and initialise the base ``SgmlLinkExtractor``.

    Every parameter except ``check_url`` is forwarded unchanged, by keyword,
    to ``SgmlLinkExtractor.__init__``.

    :param check_url: extra flag kept on the instance as ``self.check_url``;
        it is not passed to the base class.
        NOTE(review): how the flag is consumed is not visible in this method.
    """
    # ``('href')`` is just the string 'href'; the trailing comma makes the
    # default a real one-element tuple, matching the shape of ``tags``.
    self.check_url = check_url

    SgmlLinkExtractor.__init__(
        self,
        allow=allow,
        deny=deny,
        allow_domains=allow_domains,
        deny_domains=deny_domains,
        restrict_xpaths=restrict_xpaths,
        tags=tags,
        attrs=attrs,
        canonicalize=canonicalize,
        unique=unique,
        process_value=process_value,
    )
Example #2
0
    def __init__(self,
                 allow=(),
                 deny=(),
                 allow_domains=(),
                 deny_domains=(),
                 restrict_xpaths=(),
                 tags=('a', 'area'),
                 attrs=('href',),
                 canonicalize=True,
                 unique=True,
                 process_value=None,
                 check_url=True):
        """Store ``check_url`` and initialise the base ``SgmlLinkExtractor``.

        All parameters other than ``check_url`` are handed straight to
        ``SgmlLinkExtractor.__init__`` as keyword arguments.

        :param check_url: extra flag saved as ``self.check_url``; the base
            class never receives it.
            NOTE(review): its consumer is outside this method.
        """
        # The default for ``attrs`` used to be ``('href')``, which Python
        # evaluates to the bare string 'href'. It is now a one-element tuple.
        self.check_url = check_url

        SgmlLinkExtractor.__init__(self,
                                   allow=allow,
                                   deny=deny,
                                   allow_domains=allow_domains,
                                   deny_domains=deny_domains,
                                   restrict_xpaths=restrict_xpaths,
                                   tags=tags,
                                   attrs=attrs,
                                   canonicalize=canonicalize,
                                   unique=unique,
                                   process_value=process_value)
Example #3
0
    def __init__(self,
                 allow=(),
                 deny=(),
                 allow_domains=(),
                 deny_domains=(),
                 restrict_xpaths=(),
                 tags=('a', 'area'),
                 attrs=('href',),
                 canonicalize=True,
                 unique=True,
                 process_value=None,
                 ignore_set=None):
        """Store ``ignore_set`` and initialise the base ``SgmlLinkExtractor``.

        All parameters other than ``ignore_set`` are forwarded by keyword to
        ``SgmlLinkExtractor.__init__``.

        :param ignore_set: set kept on the instance as ``self.ignore_set``.
            When omitted (or ``None``) each instance gets its own new empty
            set. A set passed in by the caller is stored as-is, not copied.
            NOTE(review): what the set holds is not visible in this method.
        """
        # A ``set()`` default is created once at definition time and would be
        # shared by every instance that relies on it; build one per instance.
        self.ignore_set = set() if ignore_set is None else ignore_set

        # ``attrs`` defaults to a real one-element tuple; the previous
        # ``('href')`` was simply the string 'href'.
        SgmlLinkExtractor.__init__(self,
                                   allow=allow,
                                   deny=deny,
                                   allow_domains=allow_domains,
                                   deny_domains=deny_domains,
                                   restrict_xpaths=restrict_xpaths,
                                   tags=tags,
                                   attrs=attrs,
                                   canonicalize=canonicalize,
                                   unique=unique,
                                   process_value=process_value)
Example #4
0
 def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
              tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True, process_value=None,
              deny_extensions=None, seen_urls=()):
     """Initialise the base ``SgmlLinkExtractor`` and mark the given URLs as seen.

     All parameters other than ``seen_urls`` are forwarded by keyword to
     ``SgmlLinkExtractor.__init__``.

     :param seen_urls: iterable of keys; each one is stored in
         ``self.seen_urls`` with the value ``True``.
     """
     # ``attrs`` defaults to a true one-element tuple (``('href')`` is only a
     # string), and ``seen_urls`` to an immutable empty tuple rather than a
     # list literal shared between calls.
     SgmlLinkExtractor.__init__(self, allow=allow, deny=deny, allow_domains=allow_domains,
                                deny_domains=deny_domains, restrict_xpaths=restrict_xpaths,
                                tags=tags, attrs=attrs, canonicalize=canonicalize, unique=unique,
                                process_value=process_value, deny_extensions=deny_extensions)

     # ``self.seen_urls`` is not created here; it must already exist as a mapping.
     # NOTE(review): confirm where it is defined - a class-level dict would be
     # shared by all instances.
     for url in seen_urls:
         self.seen_urls[url] = True
Example #5
0
    def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
                 tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True, process_value=None,
                 ignore_set=None):
        """Store ``ignore_set`` and initialise the base ``SgmlLinkExtractor``.

        Everything except ``ignore_set`` is passed on, by keyword, to
        ``SgmlLinkExtractor.__init__``.

        :param ignore_set: set saved as ``self.ignore_set``. Leaving it out
            (or passing ``None``) gives the instance a new empty set of its
            own; a caller-supplied set is stored without copying.
            NOTE(review): the meaning of its members is not visible here.
        """
        # Avoid the shared-mutable-default trap: a ``set()`` default would be
        # one object reused by every instance created without the argument.
        if ignore_set is None:
            ignore_set = set()
        self.ignore_set = ignore_set

        # ``attrs`` now defaults to ``('href',)``; ``('href')`` was a plain string.
        SgmlLinkExtractor.__init__(self, allow=allow, deny=deny,
                allow_domains=allow_domains, deny_domains=deny_domains,
                restrict_xpaths=restrict_xpaths, tags=tags, attrs=attrs,
                canonicalize=canonicalize, unique=unique, process_value=process_value)
    def __init__(self, *args, **kwargs):
        """Pull the range options out of ``kwargs`` and initialise the base class.

        ``allow_range`` and ``deny_range`` are removed from ``kwargs`` (each
        defaulting to ``None`` when absent) and kept on the instance. The
        remaining positional and keyword arguments go to
        ``SgmlLinkExtractor.__init__`` untouched.
        """
        # Pop before delegating so the base initialiser never sees these keys.
        allow_range = kwargs.pop('allow_range', None)
        deny_range = kwargs.pop('deny_range', None)
        self.allow_range, self.deny_range = allow_range, deny_range

        SgmlLinkExtractor.__init__(self, *args, **kwargs)
Example #7
0
    def __init__(self, *args, **kwargs):
        """Keep ``allow_range`` / ``deny_range`` locally, pass the rest to the base.

        Both options are taken out of ``kwargs`` (``None`` when not supplied)
        and stored as instance attributes. ``SgmlLinkExtractor.__init__`` then
        receives the remaining arguments unchanged.
        """
        # The two keys are consumed here, so they are not forwarded below.
        range_allowed = kwargs.pop('allow_range', None)
        range_denied = kwargs.pop('deny_range', None)

        self.allow_range = range_allowed
        self.deny_range = range_denied

        SgmlLinkExtractor.__init__(self, *args, **kwargs)
Example #8
0
 def __init__(self, allow, restrict_xpaths=()):
     """Initialise ``BuggySgmlLinkExtractor`` with only ``allow`` and ``restrict_xpaths``.

     Both values are passed through by keyword; no other base-class option
     is set from here.
     """
     base_options = {'allow': allow, 'restrict_xpaths': restrict_xpaths}
     BuggySgmlLinkExtractor.__init__(self, **base_options)