Example #1
0
 def startElementNS(self, name, qname, attrs):
     """SAX callback: push a handler for the new element and compute the
     base URI and language it should inherit.

     Base resolution: an explicit base attribute (defragmented) is joined
     against the parent's base, falling back to the document's public or
     system id; otherwise the parent's base (or defragmented document id)
     is inherited as-is.
     """
     stack = self.stack
     stack.append(ElementHandler())
     current = self.current
     parent = self.parent
     base = attrs.get(BASE, None)
     if base is not None:
         # Explicit base attribute: strip its fragment, then resolve it
         # relative to the parent's base when one exists.
         base, frag = urldefrag(base)
         if parent and parent.base:
             base = urljoin(parent.base, base)
         else:
             # No parent base — resolve against the document id instead.
             systemId = self.locator.getPublicId() \
                 or self.locator.getSystemId()
             if systemId:
                 base = urljoin(systemId, base)
     else:
         # No explicit base: inherit from the parent, then fall back to
         # the defragmented document id.
         if parent:
             base = parent.base
         if base is None:
             systemId = self.locator.getPublicId() \
                 or self.locator.getSystemId()
             if systemId:
                 base, frag = urldefrag(systemId)
     current.base = base
     language = attrs.get(LANG, None)
     if language is None:
         # Language is inherited from the enclosing element when absent.
         if parent:
             language = parent.language
     current.language = language
     current.start(name, qname, attrs)
Example #2
0
    def get(self):
        """List the current user's bookmarks, optionally filtered.

        Query parameters: ``url`` (compared after removing the fragment)
        and repeated ``tag`` values (passed to ``tags.contains``).
        Returns a paginated JSON list plus a ``Link ... rel='last'``
        header when more pages exist.
        """
        url = request.args.get('url')
        tags = request.args.getlist('tag')

        # Always scope the query to the authenticated user.
        filters = [db.Bookmark.user == current_user.id]
        if url is not None:
            # Compare without the fragment so "#..." variants match.
            filters.append(db.Bookmark.url == urldefrag(url).url)

        # If Bookmark.tags is null, filtering will yield no results
        if tags:
            filters.append(db.Bookmark.tags.contains(tags))

        # Unread first (read is NULL), then newest first.
        result = db.Bookmark.query.filter(*filters) \
                                  .order_by(
                                      db.Bookmark.read.desc().nullsfirst(),
                                      db.Bookmark.timestamp.desc()) \
                                  .paginate()
        headers = {}
        links = []
        if result.has_next:
            last_url = update_query(request.url, {'page': result.pages})
            links.append(lh.Link(last_url, rel='last'))

        if links:
            headers['Link'] = lh.format_links(links)
        return list(map(lambda x: x.to_dict(), result.items)), 200, headers
Example #3
0
    def post(self, id):
        """Partially update bookmark *id* owned by the current user.

        Accepts a JSON object with any of: url, title, timestamp, read,
        tags.  Changing ``id`` is rejected with 400; an unknown or
        foreign bookmark yields 404.
        """
        update = request.get_json()
        if 'id' in update:
            return {'error': 'Updating id is not allowed'}, \
                HTTPStatus.BAD_REQUEST

        # Filtering on user as well prevents editing others' bookmarks.
        bookmark = db.Bookmark.query \
                              .filter_by(id=id, user=current_user.id) \
                              .first()
        if bookmark is None:
            return {'error': 'Not found'}, HTTPStatus.NOT_FOUND

        if 'url' in update:
            # URLs are stored without their fragment part.
            bookmark.url = urldefrag(update['url']).url
        if 'title' in update:
            bookmark.title = update['title']
        if 'timestamp' in update:
            bookmark.timestamp = aniso8601.parse_datetime(update['timestamp'])
        if 'read' in update:
            # "read" is an ISO-8601 datetime, or falsy to mark unread.
            if update['read']:
                bookmark.read = aniso8601.parse_datetime(update['read'])
            else:
                bookmark.read = None
        if 'tags' in update:
            bookmark.tags = update['tags']

        db.db.session.add(bookmark)
        db.db.session.commit()

        return bookmark.to_dict(), HTTPStatus.OK
 def get_urls(self, postings: list):
     """Return the defragmented URLs for *postings*, de-duplicated.

     Order of first occurrence is preserved.  Membership is tracked in a
     set so the loop is O(n) instead of the original O(n^2) list scan.
     """
     urls = []
     seen = set()
     for posting in postings:
         # id_file maps a posting's doc_id to its stored URL.
         defraged = urldefrag(self.id_file[posting.doc_id])[0]
         if defraged not in seen:
             seen.add(defraged)
             urls.append(defraged)
     return urls
Example #5
0
    def __init__(self, request, timeout=180):
        """Initialise a Twisted HTTP client factory from *request*.

        Captures url (without fragment), method, body and headers, and
        creates the deferred that will build the final response.
        """
        # Fragments are client-side only and must not be sent on the wire.
        self._url = urldefrag(request.url)[0]
        # converting to bytes to comply to Twisted interface
        self.url = to_bytes(self._url, encoding='ascii')
        self.method = to_bytes(request.method, encoding='ascii')
        self.body = request.body or None
        self.headers = Headers(request.headers)
        self.response_headers = None
        # Per-request download_timeout (from request.meta) wins over default.
        self.timeout = request.meta.get('download_timeout') or timeout
        self.start_time = time()
        self.deferred = defer.Deferred().addCallback(self._build_response,
                                                     request)

        # Fixes Twisted 11.1.0+ support as HTTPClientFactory is expected
        # to have _disconnectedDeferred. See Twisted r32329.
        # As Scrapy implements it's own logic to handle redirects is not
        # needed to add the callback _waitForDisconnect.
        # Specifically this avoids the AttributeError exception when
        # clientConnectionFailed method is called.
        self._disconnectedDeferred = defer.Deferred()

        self._set_connection_attributes(request)

        # set Host header based on url
        self.headers.setdefault('Host', self.netloc)

        # set Content-Length based len of body
        if self.body is not None:
            self.headers['Content-Length'] = len(self.body)
            # just in case a broken http/1.1 decides to keep connection alive
            self.headers.setdefault("Connection", "close")
        # Content-Length must be specified in POST method even with no body
        elif self.method == b'POST':
            self.headers['Content-Length'] = 0
Example #6
0
def parse_scm(collection, version):
    """Extract name, version, path and subdir out of the SCM pointer."""
    if ',' in collection:
        # A ",version" suffix embedded in the pointer wins over the argument.
        collection, version = collection.split(',', 1)
    elif version == '*' or not version:
        version = 'HEAD'

    # Strip the "git+" scheme marker when present.
    path = collection[len('git+'):] if collection.startswith('git+') else collection

    # Split off the "#subdir" fragment and trim stray separators from it.
    path, fragment = urldefrag(path)
    fragment = fragment.strip(os.path.sep)

    if path.endswith(os.path.sep + '.git'):
        # ".../repo/.git" layout: repo name is the next-to-last segment.
        name = path.split(os.path.sep)[-2]
    elif '://' not in path and '@' not in path:
        # Bare collection name or local path without scheme/user info.
        name = path
    else:
        # Remote URL: take the last path segment, minus a ".git" suffix.
        name = path.split('/')[-1]
        if name.endswith('.git'):
            name = name[:-4]

    return name, version, path, fragment
Example #7
0
def UrlQueueFilter(url, currentUrl, filterArr, inputthread):
    """Return the normalised url to queue, or False to drop it.

    A url is dropped when it was already seen, matches a filter substring,
    or contains an over-long DNS label.  Relative urls are absolutised
    against *currentUrl* and the fragment is removed before returning.
    """
    global SeenDB
    # Flatten the nested filter lists into one sequence of substrings.
    blocked = [item for sub in filterArr for item in sub]
    isseen, SeenDB, k = DBCtrl.checkinSeenDB(url, SeenDB)
    if isseen == 1:
        # Already crawled.
        return False
    if isseen == -1:
        # Seen-DB signalled a reset condition; reset and re-register.
        DBCtrl.resetSeenDB(inputthread)
        isseen, SeenDB, k = DBCtrl.checkinSeenDB(url, SeenDB)
    if any(item in url for item in blocked):
        return False
    # DNS labels are limited to 63 characters.
    for label in urlparse(url).netloc.split('.'):
        if len(label) > 63:
            return False
    # Absolutise relative links against the current page.
    if 'https://' not in url and 'http://' not in url:
        url = urljoin(currentUrl, url)
    newurl, _unused_frag = urldefrag(url)
    return newurl
Example #8
0
def extract_next_links(url, resp):  # specifications number 3.2, 3.3
    """Return the unique, defragmented links found on a crawled page.

    Only responses with an acceptable status and text/html content are
    parsed; urls already in the module-level ``uniqueDomains`` set are
    skipped.
    """
    next_links = set()
    base_url = urlparse(url)
    valid = [200, 201, 203]  # list of valid html status codes https://www.w3.org/Protocols/HTTP/HTRESP.html
    if resp.status in valid:
        if url in uniqueDomains:
            return list()

        # Keep only the media type, dropping charset etc. after ';'.
        resp_content = resp.raw_response.headers['Content-Type'].split(';')[0]
        if resp_content != "text/html":
            return list()  # found on stackoverflow on how to read headers

        # uniqueDomains.add(url)
        soup = BeautifulSoup(resp.raw_response.content, "lxml")  # using lxml to read content
        if resp.status == 200 and str(soup) == "":  # making sure website is not empty
            return list()

        for link in soup.find_all("a"):  # uses beautifulsoup to find urls
            link = link.get("href")
            if link is None or link == "":
                continue
            else:
                link = link.lower()  # making link lowercase in order to defragment

            # Strip "#fragment" then absolutise relative links.
            defragmented_url = urldefrag(link)[0]
            fixed_link = fix_url(defragmented_url, base_url)  # need to fix relative links
            if fixed_link not in uniqueDomains:  # makes sure only unique domains are being crawled
                uniqueDomains.add(fixed_link)
                next_links.add(fixed_link)
            else:
                continue
    return list(next_links)  # returns list of set of links (makes sure links are unique)
Example #9
0
def extract_domains(site_text):
    """Collect unique, defragmented absolute link targets from HTML text.

    Relative links (empty scheme) and mailto: addresses are skipped.
    """
    only_a_tags = SoupStrainer("a")
    found = set()
    for anchor in BeautifulSoup(site_text, "html.parser", parse_only=only_a_tags):
        if not anchor.has_attr('href'):
            continue
        href = anchor["href"]
        if urlparse(href).scheme in ["", "mailto"]:
            continue
        found.add(urldefrag(href)[0])
    return list(found)
Example #10
0
    def _test_link(self, link, external):
        """Check single link. Either local or remote"""
        base_link, fragment = urldefrag(link)
        # Cached verdict for this exact link?
        if link in self.seen_links:
            return self.seen_links[link]

        # If the fragment-less base already failed, the full link fails too.
        if base_link in self.seen_links and not self.seen_links[base_link]:
            return False

        if external:
            # Remote: full GET + parse only when an anchor must be found,
            # otherwise a cheap HEAD request suffices.
            if fragment:
                result = self._test_http_fragment(base_link, fragment)
            else:
                result = test_http_head(link)
        elif fragment:
            # Local file that must contain the anchor.
            result = self._test_file_fragment(base_link, fragment)
        else:
            # Plain local file: existence is enough.
            result = os.path.exists(link)

        self.seen_links[link] = result
        return result
Example #11
0
def url_to_overrides(url_string):
    """Map an overrides url onto the matching Overrides* handler object.

    Raises UnsupportedUrlError for unknown schemes or a git+ url that
    lacks the mandatory fragment.
    """
    url = urlparse(url_string)
    scheme = url.scheme
    if scheme in ('', 'file'):
        # Plain or file:// paths are local override files.
        return OverridesFile(url.path)
    if scheme in ('http', 'https'):
        return OverridesUrl(url.geturl())
    if scheme.startswith('git+'):
        if not url.fragment:
            raise UnsupportedUrlError(
                ('Cannot handle overrides with no path given, offeding url was'
                 ' {url}.')
                .format(
                    url=url_string
                )
            )
        # The fragment carries "key=value" pairs joined by '&'.
        fragments = dict(pair.split('=') for pair in url.fragment.split('&'))
        return OverridesGit(
            # [4:] strips the "git+" prefix; urldefrag drops the fragment.
            repo_url=urldefrag(url.geturl()[4:])[0],
            path=fragments['path'],
            rev=fragments.get('rev', None),
        )
    raise UnsupportedUrlError('Cannot handle common overrides url %s' %
                              url_string)
Example #12
0
def oauth_callback():
    """OAuth provider redirect target: log the user in and hand off a token.

    On success, redirects to the "next" url with the auth token placed in
    the url fragment; on any failure returns 401.
    """
    if not settings.OAUTH:
        abort(404)

    resp = oauth.provider.authorized_response()
    if resp is None or isinstance(resp, OAuthException):
        log.warning("Failed OAuth: %r", resp)
        return Unauthorized("Authentication has failed.")

    response = signals.handle_oauth_session.send(provider=oauth.provider,
                                                 oauth=resp)
    # The first signal handler that resolves a role wins.
    for (_, role) in response:
        if role is None:
            continue
        update_role(role)
        db.session.commit()
        log.info("Logged in: %r", role)
        authz = Authz.from_role(role)
        token = authz.to_token(role=role)
        token = token.decode('utf-8')
        state = request.args.get('state')
        next_url = get_best_next_url(state, request.referrer)
        # Strip any existing fragment before appending the token fragment.
        next_url, _ = urldefrag(next_url)
        next_url = '%s#token=%s' % (next_url, token)
        return redirect(next_url)

    log.error("No OAuth handler for %r was installed.", oauth.provider.name)
    return Unauthorized("Authentication has failed.")
    def resolving(self, ref: str):
        """
        Context manager which resolves a JSON ``ref`` and enters the
        resolution scope of this ref.
        """
        # Resolve the reference against the current scope, then split the
        # document uri from the in-document fragment pointer.
        new_uri = fixed_urljoin(self.resolution_scope, ref)
        uri, fragment = urlparse.urldefrag(new_uri)

        normalized_uri = normalize(uri)
        if normalized_uri in self.store:
            # Already fetched (or pre-registered) document.
            schema = self.store[normalized_uri]
        elif not uri or uri == self.base_uri:
            # Reference into the current document.
            schema = self.schema
        else:
            schema = resolve_remote(uri, self.handlers)
            if self.cache:
                # The 'internal-no-cache' scheme opts a document out of
                # being stored.
                scheme = urlparse.urlsplit(normalized_uri).scheme
                if scheme != 'internal-no-cache':
                    self.store[normalized_uri] = schema

        # Swap in the new (base_uri, schema) pair for the duration of the
        # with-block; the finally clause restores the previous pair.
        old_base_uri, old_schema = self.base_uri, self.schema
        self.base_uri, self.schema = uri, schema
        try:
            with self.in_scope(uri):
                yield resolve_path(schema, fragment)
        finally:
            self.base_uri, self.schema = old_base_uri, old_schema
Example #14
0
 def _GetMeasureDataForSlice(self, slice_id, tableMappings):
     """Collect measure metadata for a slice, keyed by CSV column id.

     Returns a dict mapping column identifier -> {'id', 'unit_code',
     'unit_text'}.  Exits the process when no column id can be derived
     for a measure.
     """
     ret = {}
     measures = sorted(
         self.graph.objects(subject=slice_id, predicate=SCHEMA.measure))
     for measure_id in measures:
         unit_codes = list(
             self.graph.objects(subject=measure_id,
                                predicate=SCHEMA.unitCode))
         unit_texts = list(
             self.graph.objects(subject=measure_id,
                                predicate=SCHEMA.unitText))
         # Default column id is the fragment of the measure's URI; an
         # explicit tableMapping for this measure overrides it.
         csv_id = urldefrag(measure_id).fragment
         for tableMapping in tableMappings:
             if tableMapping['sourceEntity'] == measure_id:
                 csv_id = str(tableMapping['columnIdentifier'])
                 break
         if not csv_id:
             print("Unable to determine CSV ID for metric",
                   measure_id,
                   file=sys.stderr)
             exit(1)
         ret[csv_id] = {
             'id': measure_id,
             'unit_code': unit_codes,
             'unit_text': unit_texts,
         }
     return ret
Example #15
0
    def url(self, name, force=False):
        """
        Returns the real URL in DEBUG mode.

        Otherwise (or with force=True) looks up the hashed file name —
        via the cache, recomputing on a miss — and re-applies any
        "#fragment" or "?#" query hack from *name* to the final URL.
        """
        if settings.DEBUG and not force:
            hashed_name, fragment = name, ''
        else:
            clean_name, fragment = urldefrag(name)
            if urlsplit(clean_name).path.endswith('/'):  # don't hash paths
                hashed_name = name
            else:
                cache_key = self.cache_key(name)
                hashed_name = self.cache.get(cache_key)
                if hashed_name is None:
                    # Normalise Windows path separators for URLs.
                    hashed_name = self.hashed_name(clean_name).replace('\\', '/')
                    # set the cache if there was a miss
                    # (e.g. if cache server goes down)
                    self.cache.set(cache_key, hashed_name)

        final_url = super(CachedFilesMixin, self).url(hashed_name)

        # Special casing for a @font-face hack, like url(myfont.eot?#iefix")
        # http://www.fontspring.com/blog/the-new-bulletproof-font-face-syntax
        query_fragment = '?#' in name  # [sic!]
        if fragment or query_fragment:
            urlparts = list(urlsplit(final_url))
            if fragment and not urlparts[4]:
                urlparts[4] = fragment
            if query_fragment and not urlparts[3]:
                urlparts[2] += '?'
            final_url = urlunsplit(urlparts)

        return unquote(final_url)
Example #16
0
    def url(self, name, force=False):
        """
        Returns the real URL in DEBUG mode.

        Otherwise (or with force=True) looks up the hashed file name —
        via the cache, recomputing on a miss — and re-applies any
        "#fragment" or "?#" query hack from *name* to the final URL.
        """
        if settings.DEBUG and not force:
            hashed_name, fragment = name, ''
        else:
            clean_name, fragment = urldefrag(name)
            if urlsplit(clean_name).path.endswith('/'):  # don't hash paths
                hashed_name = name
            else:
                cache_key = self.cache_key(name)
                hashed_name = self.cache.get(cache_key)
                if hashed_name is None:
                    # Normalise Windows path separators for URLs.
                    hashed_name = self.hashed_name(clean_name).replace('\\', '/')
                    # set the cache if there was a miss
                    # (e.g. if cache server goes down)
                    self.cache.set(cache_key, hashed_name)

        final_url = super(CachedFilesMixin, self).url(hashed_name)

        # Special casing for a @font-face hack, like url(myfont.eot?#iefix")
        # http://www.fontspring.com/blog/the-new-bulletproof-font-face-syntax
        query_fragment = '?#' in name  # [sic!]
        if fragment or query_fragment:
            urlparts = list(urlsplit(final_url))
            if fragment and not urlparts[4]:
                urlparts[4] = fragment
            if query_fragment and not urlparts[3]:
                urlparts[2] += '?'
            final_url = urlunsplit(urlparts)

        return unquote(final_url)
Example #17
0
    def _url(self, hashed_name_func, name, force=False, hashed_files=None):
        """
        Return the non-hashed URL in DEBUG mode.

        Otherwise (or with force=True) derive the hashed name via
        *hashed_name_func* and re-apply any "#fragment" or "?#" query
        hack from *name* to the final URL.
        """
        if settings.DEBUG and not force:
            hashed_name, fragment = name, ''
        else:
            clean_name, fragment = urldefrag(name)
            if urlsplit(clean_name).path.endswith('/'):  # don't hash paths
                hashed_name = name
            else:
                # hashed_files is only passed when the caller has a
                # precomputed mapping to consult.
                args = (clean_name,)
                if hashed_files is not None:
                    args += (hashed_files,)
                hashed_name = hashed_name_func(*args)

        final_url = super().url(hashed_name)

        # Special casing for a @font-face hack, like url(myfont.eot?#iefix")
        # http://www.fontspring.com/blog/the-new-bulletproof-font-face-syntax
        query_fragment = '?#' in name  # [sic!]
        if fragment or query_fragment:
            urlparts = list(urlsplit(final_url))
            if fragment and not urlparts[4]:
                urlparts[4] = fragment
            if query_fragment and not urlparts[3]:
                urlparts[2] += '?'
            final_url = urlunsplit(urlparts)

        return unquote(final_url)
Example #18
0
def normalize(seed_url, link):
    """
    Normalize this URL by removing hash and adding domain
    """
    # Drop the "#fragment" part, then absolutise against the seed url.
    defragged, _fragment = urlparse.urldefrag(link)
    return urlparse.urljoin(seed_url, defragged)
Example #19
0
 def get_div_link(self, tip):
     """Return the defragmented href of the 'qlink' anchor near *tip*.

     Returns '' when no such anchor is found.
     """
     anchor = tip.parent.find('a', class_='qlink')
     if not anchor:
         return ''
     return urldefrag(anchor.get('href'))[0]
Example #20
0
def getlinks(pageurl, pageresponse, domain):
    """Returns a list of links from from this page to be crawled.
    pageurl = URL of this page
    pageresponse = page content; response object from requests module
    domain = domain being crawled (None to return links to *any* domain)
    """
    soup = bs4.BeautifulSoup(pageresponse.text, "html.parser")

    # Pull the href of every anchor, strip fragments, drop empties.
    hrefs = [a.attrs.get('href') for a in soup.select('a[href]')]
    hrefs = [urldefrag(href)[0] for href in hrefs]
    hrefs = [href for href in hrefs if href]

    # Relative links become absolute against this page's url.
    hrefs = [
        href if urlparse(href).netloc else urljoin(pageurl, href)
        for href in hrefs
    ]

    # When crawling a single domain, discard foreign links.
    if domain:
        hrefs = [href for href in hrefs if urlparse(href).netloc == domain]

    return hrefs
Example #21
0
    def resolving(self, ref):
        """
        Context manager which resolves a JSON ``ref`` and enters the
        resolution scope of this ref.

        :argument str ref: reference to resolve

        """

        # Resolve against the current scope, then split the document uri
        # from the fragment pointer.
        full_uri = urlparse.urljoin(self.resolution_scope, ref)
        uri, fragment = urlparse.urldefrag(full_uri)

        if uri in self.store:
            # Pre-registered / previously fetched document.
            document = self.store[uri]
        elif not uri or uri == self.base_uri:
            # Reference into the current document.
            document = self.referrer
        else:
            document = self.resolve_remote(uri)

        # Temporarily make the referenced document the resolution context;
        # the finally clause restores the previous one.
        old_base_uri, old_referrer = self.base_uri, self.referrer
        self.base_uri, self.referrer = uri, document
        try:
            with self.in_scope(uri):
                yield self.resolve_fragment(document, fragment)
        finally:
            self.base_uri, self.referrer = old_base_uri, old_referrer
Example #22
0
    def __init__(self, url, previous=None, **info):
        """Normalise *url* and initialise runtime state for this item.

        Raises urlnorm.InvalidUrl (with the offending url appended to the
        message) when the url cannot be normalised.
        """
        # Apply the simple idempotent optimizations to all urls (no need to
        # ever deal with "HTTP://.."). This means case-sensitivity, and a
        # whole lot of other things that the urlnorm library will do for us.
        # We call this the original url, even though it is a bit of a lie.
        try:
            self.original_url = urlnorm.norm(url)
        except urlnorm.InvalidUrl as e:
            raise urlnorm.InvalidUrl('{}: {}'.format(e, url))

        # For the normalized url that we'll be exposing, remove the
        # fragment, and treat https and http the same.
        url, fragment = urldefrag(self.original_url)
        # The stripped pieces are kept so no information is truly lost.
        self.lossy_url_data = {'fragment': fragment}
        if url.startswith('https:'):
            url = 'http' + url[5:]
            self.lossy_url_data.update({'protocol': 'https'})
        self.url = url

        self.set_previous(previous)
        self.info = info
        self.post = None

        # Runtime data
        self.response = None
        self.exception = None
        self.retries = 0
Example #23
0
def escape_ajax(url):
    """
    Return the crawleable url according to:
    https://developers.google.com/webmasters/ajax-crawling/docs/getting-started

    >>> escape_ajax("www.example.com/ajax.html#!key=value")
    'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html?k1=v1&k2=v2#!key=value")
    'www.example.com/ajax.html?k1=v1&k2=v2&_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html?#!key=value")
    'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html#!")
    'www.example.com/ajax.html?_escaped_fragment_='

    URLs that are not "AJAX crawlable" (according to Google) returned as-is:

    >>> escape_ajax("www.example.com/ajax.html#key=value")
    'www.example.com/ajax.html#key=value'
    >>> escape_ajax("www.example.com/ajax.html#")
    'www.example.com/ajax.html#'
    >>> escape_ajax("www.example.com/ajax.html")
    'www.example.com/ajax.html'
    """
    base, fragment = urldefrag(url)
    if not fragment.startswith('!'):
        # Not an AJAX-crawlable ("#!") url — leave it untouched.
        return url
    # Move the "!..." payload into the _escaped_fragment_ query parameter.
    return add_or_replace_parameter(base, '_escaped_fragment_', fragment[1:])
Example #24
0
    def download_request(self, request):
        """Issue *request* through the Twisted agent and return a deferred.

        The deferred fires with the built response; a reactor timer
        cancels it once the download timeout elapses.
        """
        from twisted.internet import reactor
        timeout = request.meta.get('download_timeout') or self._connectTimeout
        agent = self._get_agent(request, timeout)

        # request details
        # The fragment is client-side only; never send it on the wire.
        url = urldefrag(request.url)[0]
        method = to_bytes(request.method)
        headers = TxHeaders(request.headers)
        if isinstance(agent, self._TunnelingAgent):
            # Credentials were already consumed to establish the tunnel.
            headers.removeHeader(b'Proxy-Authorization')
        if request.body:
            bodyproducer = _RequestBodyProducer(request.body)
        else:
            bodyproducer = None
        start_time = time()
        d = agent.request(method, to_bytes(url, encoding='ascii'), headers,
                          bodyproducer)
        # set download latency
        d.addCallback(self._cb_latency, request, start_time)
        # response body is ready to be consumed
        d.addCallback(self._cb_bodyready, request)
        d.addCallback(self._cb_bodydone, request, url)
        # check download timeout
        self._timeout_cl = reactor.callLater(timeout, d.cancel)
        d.addBoth(self._cb_timeout, request, url, timeout)
        return d
Example #25
0
def convert_link(base_url, href):
    """Convert *href* (as found on page *base_url*) to a relative local path.

    Returns None when the target is unknown to site_urls and no file
    name can be derived from its extension.
    """
    # check for redirect
    # get content-type
    # compute href file name
    # convert to relative path
    href_url = urljoin(base_url, href)
    href_url_defrag = urldefrag(href_url)[0]
    base_path = url_to_file_name(base_url, 'text/html')  # has to be html
    if href_url_defrag in site_redirects:
        # Follow a recorded redirect before looking the target up.
        href_url_defrag = site_redirects[href_url_defrag]
    if href_url_defrag in site_urls:
        content_type = site_urls[href_url_defrag]['content-type']
        href_path = url_to_file_name(href_url, content_type)
        # Make the path relative to the directory of the referring page.
        href_path = posixpath.relpath(href_path,
                                      start=os.path.dirname(base_path))
        return href_path
    else:
        href_path = url_to_file_name(
            href_url, None)  # allow links not in site_urls if have extension
        if href_path:
            return posixpath.relpath(href_path,
                                     start=os.path.dirname(base_path))
        else:
            print('Unknown URL ' + href_url + ' not in site_urls')
            return None
Example #26
0
def oauth_callback():
    """OAuth provider redirect target: log the user in and hand off a token.

    On success, records the login audit event and redirects to the "next"
    url with the auth token placed in the url fragment; on any failure
    returns 401.
    """
    if not settings.OAUTH:
        abort(404)

    resp = oauth.provider.authorized_response()
    if resp is None or isinstance(resp, OAuthException):
        log.warning("Failed OAuth: %r", resp)
        return Unauthorized("Authentication has failed.")

    response = signals.handle_oauth_session.send(provider=oauth.provider,
                                                 oauth=resp)
    # The first signal handler that resolves a role wins.
    for (_, role) in response:
        if role is None:
            continue
        db.session.commit()
        update_role(role)
        log.info("Logged in: %r", role)
        request.authz = Authz.from_role(role)
        record_audit(Audit.ACT_LOGIN)
        token = request.authz.to_token(role=role)
        token = token.decode('utf-8')
        state = request.args.get('state')
        next_url = get_best_next_url(state, request.referrer)
        # Strip any existing fragment before appending the token fragment.
        next_url, _ = urldefrag(next_url)
        next_url = '%s#token=%s' % (next_url, token)
        return redirect(next_url)

    log.error("No OAuth handler for %r was installed.", oauth.provider.name)
    return Unauthorized("Authentication has failed.")
Example #27
0
    def replace_refs(cls, obj, _recursive=False, **kwargs):
        """
        Returns a deep copy of `obj` with all contained JSON reference objects
        replaced with :class:`JsonRef` instances.

        :param obj: If this is a JSON reference object, a :class:`JsonRef`
            instance will be created. If `obj` is not a JSON reference object,
            a deep copy of it will be created with all contained JSON
            reference objects replaced by :class:`JsonRef` instances
        :param base_uri: URI to resolve relative references against
        :param loader: Callable that takes a URI and returns the parsed JSON
            (defaults to global ``jsonloader``, a :class:`JsonLoader` instance)
        :param jsonschema: Flag to turn on `JSON Schema mode
            <http://json-schema.org/latest/json-schema-core.html#anchor25>`_.
            'id' keyword changes the `base_uri` for references contained within
            the object
        :param load_on_repr: If set to ``False``, :func:`repr` call on a
            :class:`JsonRef` object will not cause the reference to be loaded
            if it hasn't already. (defaults to ``True``)

        """

        # Shared cache of resolved documents, keyed by (normalised) URI.
        store = kwargs.setdefault("_store", _URIDict())
        base_uri, frag = urlparse.urldefrag(kwargs.get("base_uri", ""))
        store_uri = None  # If this does not get set, we won't store the result
        if not frag and not _recursive:
            store_uri = base_uri
        try:
            # JSON Schema mode: a string "id" property rebases references
            # contained within this object.
            if kwargs.get("jsonschema") and isinstance(obj["id"], basestring):
                kwargs["base_uri"] = urlparse.urljoin(
                    kwargs.get("base_uri", ""), obj["id"]
                )
                store_uri = kwargs["base_uri"]
        except (TypeError, LookupError):
            # obj is not subscriptable or has no "id" key — nothing to do.
            pass

        try:
            # A mapping with a string "$ref" value is a JSON reference object.
            if not isinstance(obj["$ref"], basestring):
                raise TypeError
        except (TypeError, LookupError):
            pass
        else:
            return cls(obj, **kwargs)

        # If our obj was not a json reference object, iterate through it,
        # replacing children with JsonRefs
        kwargs["_recursive"] = True
        path = list(kwargs.pop("_path", ()))
        if isinstance(obj, Mapping):
            obj = type(obj)(
                (k, cls.replace_refs(v, _path=path+[k], **kwargs))
                for k, v in iteritems(obj)
            )
        elif isinstance(obj, Sequence) and not isinstance(obj, basestring):
            obj = type(obj)(
                cls.replace_refs(v, _path=path+[i], **kwargs) for i, v in enumerate(obj)
            )
        if store_uri is not None:
            store[store_uri] = obj
        return obj
Example #28
0
 def _ExpandFootnotes(self):
     """Expand footnote file pointers on StatisticalDataset nodes.

     For every dataset footnote that is not itself a subject in the graph,
     fetch the CSV it points at and add one StatisticalAnnotation node per
     row, identified as "<dataset>#footnote=<codeValue>".
     """
     for result in self.graph.query(
             MakeSparqlSelectQuery(
                 ('?ds', 'a', 'schema:StatisticalDataset'),
                 ('?ds', 'schema:footnote', '?fn'),
                 ns_manager=self.graph.namespace_manager)):
         if result['fn'] not in self.subjects:
             # Remove the raw file pointer; it is replaced below by the
             # expanded per-row footnote nodes.
             self.graph.remove(
                 (result['ds'], SCHEMA.footnote, result['fn']))
             # Dataset id without its fragment, used to mint row ids.
             id_prefix = urldefrag(str(result['ds'])).url
             with self.getter.Fetch(str(result['fn'])) as f:
                 reader = DictReader(f)
                 for row in reader:
                     row_id = rdflib.URIRef(id_prefix + '#footnote=' +
                                            row['codeValue'])
                     self.graph.add((result['ds'], SCHEMA.footnote, row_id))
                     self.graph.add((row_id, rdflib.RDF.type,
                                     SCHEMA.StatisticalAnnotation))
                     for key, val in row.items():
                         # Column headers may be "field@lang".
                         fields = key.split('@')
                         if len(fields) > 1:
                             # A language code is specified
                             self.graph.add(
                                 (row_id, getattr(SCHEMA, fields[0]),
                                  rdflib.Literal(val, language=fields[1])))
                         else:
                             self.graph.add((row_id, getattr(SCHEMA, key),
                                             rdflib.Literal(val)))
Example #29
0
def getlinks(pageurl, domain, soup):
    """Returns a list of links from from this page to be crawled.

    pageurl = URL of this page
    domain = domain being crawled (None to return links to *any* domain)
    soup = BeautifulSoup object for this page
    """

    # Collect every href target, defragment it, and drop empty results.
    raw = (anchor.attrs.get('href') for anchor in soup.select('a[href]'))
    targets = [urldefrag(href)[0] for href in raw]
    targets = [href for href in targets if href]

    # Absolutise anything that lacks a network location.
    targets = [
        href if urlparse(href).netloc else urljoin(pageurl, href)
        for href in targets
    ]

    # Optionally restrict to the crawled domain.
    if domain:
        targets = [href for href in targets
                   if samedomain(urlparse(href).netloc, domain)]

    return targets
def validate(url):
  """Recursively validate wiki page links reachable from *url*.

  Fetches *url*, collects all wiki links under ``baseUrl`` (skipping
  ``/_history`` pages), records links marked with the ``absent`` class in
  the global ``invalidWikiPages``, and recurses into valid wiki links.
  Already-visited URLs (tracked in the global ``visitedUrls``) are skipped.
  """
  if url in visitedUrls: return

  visitedUrls.append(url)
  try:
    content = urlopen(url).read().decode("utf8")
  except Exception:
    # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit still
    # propagate.  Any fetch/decode failure is treated as binary content.
    return

  wikiUrls = []
  invalidUrls = []
  # This may seem redundant, but without the `.find_all('a')`, soup will also
  # contain the `DocType` element which does not have an `href` attribute.
  # See <http://stackoverflow.com/questions/17943992/beautifulsoup-and-soupstrainer-for-getting-links-dont-work-with-hasattr-returni>.
  soup = BeautifulSoup(content, parse_only=SoupStrainer('a', href=True)).find_all('a')
  for externalUrl in soup:
    # Resolve relative hrefs and drop fragments before comparing.
    fullExternalUrl = urljoin(url, urldefrag(externalUrl['href']).url)
    if baseUrl in fullExternalUrl and \
        not fullExternalUrl.endswith('/_history'):
      # GitHub marks links to nonexistent wiki pages with class "absent".
      if externalUrl.has_attr('class') and 'absent' in externalUrl['class']:
        invalidUrls.append(fullExternalUrl)
      else:
        wikiUrls.append(fullExternalUrl)

  if len(invalidUrls) > 0:
    invalidWikiPages.append((url, invalidUrls))

  for wikiUrl in wikiUrls:
    if wikiUrl not in visitedUrls:
      validate(wikiUrl)
Example #31
0
    def _url(self, hashed_name_func, name, force=False, hashed_files=None):
        """
        Return the non-hashed URL in DEBUG mode.

        hashed_name_func: callable mapping a clean (defragmented) name --
            and optionally a hashed_files mapping -- to the content-hashed
            file name.
        name: storage-relative path, possibly carrying a #fragment and/or
            a '?#' query marker.
        force: hash the name even when settings.DEBUG is on.
        hashed_files: optional precomputed {name: hashed_name} mapping,
            forwarded to hashed_name_func when given.
        """
        if settings.DEBUG and not force:
            hashed_name, fragment = name, ''
        else:
            clean_name, fragment = urldefrag(name)
            if urlsplit(clean_name).path.endswith('/'):  # don't hash paths
                hashed_name = name
            else:
                args = (clean_name,)
                if hashed_files is not None:
                    args += (hashed_files,)
                hashed_name = hashed_name_func(*args)

        final_url = super().url(hashed_name)

        # Special casing for a @font-face hack, like url(myfont.eot?#iefix")
        # http://www.fontspring.com/blog/the-new-bulletproof-font-face-syntax
        query_fragment = '?#' in name  # [sic!]
        if fragment or query_fragment:
            # urlsplit tuple indexes: [2]=path, [3]=query, [4]=fragment.
            urlparts = list(urlsplit(final_url))
            if fragment and not urlparts[4]:
                urlparts[4] = fragment
            if query_fragment and not urlparts[3]:
                # Re-append the bare '?' that defragmenting dropped.
                urlparts[2] += '?'
            final_url = urlunsplit(urlparts)

        return unquote(final_url)
Example #32
0
def validate_url(url, parent_url='http:'):
    """
    Validate a URL to be a string having an explicit recognized scheme.

    Arguments:
        url: string (or bytes) URL
        parent_url: optional string URL from which to inherit an implicit
                    scheme.

    Returns: dict having:
        valid: boolean truth value.
        url: string modified URL.
    """

    # Accept bytes for convenience; work with text internally.
    if isinstance(url, bytes):
        url = url.decode()

    parsed_url = urlparse(url)

    if parsed_url.path.startswith('/'):
        # Host-relative (or absolute) path: resolve against the parent URL
        # and strip any fragment.
        url = urldefrag(urljoin(parent_url, url))[0]

    elif not parsed_url.scheme:
        # No scheme at all: inherit the parent's scheme (default http).
        parent_scheme = urlparse(parent_url).scheme or 'http'
        url = parent_scheme + ':' + url

    parsed_url = urlparse(url)

    # Valid means an http(s)/empty scheme plus a non-empty network location.
    valid = parsed_url.scheme in ('http', 'https', '') and \
            bool(parsed_url.netloc)

    return {'valid': valid, 'url': url}
    def _parse(self, page: BeautifulSoup, url):
        """Scrape transcript links from a wiki episode-listing page.

        Walks the siblings of the article's first <h2>: each <h2> names a
        season, and each following <table> holds "Transcript" anchors for
        that season's episodes.  Returns an OrderedDict mapping season name
        to an OrderedSet of absolute episode URLs, with "Equestria Girls"
        entries collected into their own final entry.
        """
        seasons = OrderedDict()
        eqg = OrderedSet()

        # The first heading gives the initial season name.
        child = page.select_one("#WikiaArticle h2")
        season = child.text

        while child.next_sibling:
            child = child.next_sibling

            if child.name == "table":
                for a in child.find_all("a", string="Transcript"):
                    # Anchors with the "new" class point at pages that do
                    # not exist yet; skip those.
                    if not a.has_attr("class") or "new" not in a["class"]:
                        episode_url, fragment = urldefrag(a["href"])
                        episode_url = urljoin(url, episode_url)
                        if "Equestria Girls" not in season:
                            if season not in seasons:
                                seasons[season] = OrderedSet()
                            seasons[season].append(episode_url)
                        else:
                            eqg.append(episode_url)
                continue

            if child.name == "h2":
                # A new heading starts the next season's block of tables.
                season = child.text
                continue

        seasons["Equestria Girls"] = eqg
        return seasons
Example #34
0
 def _GetDimensionDataForSlice(self, slice_id, tableMappings):
     """Collect metadata for every dimension attached to *slice_id*.

     Returns a dict keyed by each dimension's CSV column id, mapping to a
     dict with the dimension's RDF id ('id'), its rdf:type objects
     ('type'), and its schema:equivalentType objects ('types').  Exits the
     process when a dimension's CSV column cannot be determined.
     """
     ret = {}
     dims = sorted(
         self.graph.objects(subject=slice_id, predicate=SCHEMA.dimension))
     for dim_id in dims:
         dim_type = list(
             self.graph.objects(subject=dim_id, predicate=rdflib.RDF.type))
         dim_equiv_types = list(
             self.graph.objects(subject=dim_id,
                                predicate=SCHEMA.equivalentType))
         # Default column id: the fragment part of the dimension URI.
         csv_id = urldefrag(dim_id).fragment
         # An explicit table mapping overrides the fragment-derived id.
         for tableMapping in tableMappings:
             if tableMapping['sourceEntity'] == dim_id:
                 csv_id = str(tableMapping['columnIdentifier'])
                 break
         if not csv_id:
             print("Unable to determine CSV ID for dimension",
                   dim_id,
                   file=sys.stderr)
             exit(1)
         ret[csv_id] = {
             'id': dim_id,
             'type': dim_type,
             'types': dim_equiv_types
         }
     return ret
Example #35
0
def has_valid_signature(request, activity):
    ''' verify that the request's HTTP signature matches the activity actor '''
    try:
        signature = Signature.parse(request)

        # The signature's key id (minus fragment) identifies the actor.
        key_actor = urldefrag(signature.key_id).url
        if key_actor != activity.get('actor'):
            raise ValueError("Wrong actor created signature.")

        signer = activitypub.resolve_remote_id(models.User, key_actor)
        if not signer:
            return False

        try:
            signature.verify(signer.key_pair.public_key, request)
        except ValueError:
            # Verification failed; the actor may have rotated keys, so
            # refresh our cached copy and retry exactly once.
            previous_key = signer.key_pair.public_key
            signer = activitypub.resolve_remote_id(
                models.User, signer.remote_id, refresh=True)
            if signer.key_pair.public_key == previous_key:
                raise  # Key unchanged.
            signature.verify(signer.key_pair.public_key, request)
    except (ValueError, requests.exceptions.HTTPError):
        return False
    return True
Example #36
0
    async def read_file(self, parser_server_queue, mongodb):
        """Consume one stored page from MongoDB and queue its links.

        Fetches a random document from the 'https3' collection, extracts
        its <a> tags, normalizes each href to an absolute, defragmented,
        cleaned URL and pushes it onto *parser_server_queue*.

        Returns "Kill the Task" when no document could be fetched, None on
        early exits, and "DONE" once all anchors were processed.
        """

        # reads one document from the database at random.
        # if error is raised, returns.
        print(
            "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF"
        )
        try:
            cursor = mongodb.fetch_random_doc('https3', 1)
            for docs in cursor:
                doc = docs['file_data']
                ids = docs['_id']
            mongodb.update_doc('https3', ids)

        except Exception as e:
            # Any failure (empty cursor, missing keys, db error) tells the
            # caller to stop this worker.
            return "Kill the Task"

        # extract_tags return list of all 'a' tags and url of the page parsed.
        # base_url ====> page url, a_tags ====> list of 'a' tags.
        base_url, a_tags = await self.extract_tags(doc)

        if base_url == "":
            return

        for url in a_tags:

            urls = url.get("href")
            if urls == None:
                # NOTE(review): returning here abandons ALL remaining
                # anchors when a single one lacks an href; a `continue` was
                # probably intended -- confirm with the author.
                return

            # check if link retrieved is in absolute form.
            # If False, convert to absolute and then check the structure of url formed.
            # If True, directly check the structure of url formed.
            # While checking structure, anything beyond and including [#,?,=,:] must be dropped.

            if self.is_absolute(urls):
                urls = await self.correct_url_structure(urls)
            else:
                url_absolute = urljoin(base_url, urls)
                print(
                    " A  B  S  O  L  U  T  E   U  R  L  ", base_url,
                    "          >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>"
                )
                # Drop the fragment before normalizing the structure.
                url_absolute = urldefrag(url_absolute)
                urls = await self.correct_url_structure(url_absolute[0])

            urls = unquote(urls)

            print()
            print()
            print(";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;  ",
                  urls, ";;;;;;;;;;;;;;;;;;;;;;;;;;;;;")

            # put in the parser_server_queue for server to consume.
            await parser_server_queue.put(urls)
            print(
                "EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE"
            )
        return "DONE"
Example #37
0
    async def _get_data_from_page(
            self, page_tld: str, host: str,
            page_domain: str) -> Tuple[Set, Set, Set, Set]:
        """Extract link data from the already-parsed page (self.html_page).

        Returns (domains, urls, backlinks, redirects):
          - domains: registered domains (under 64 chars) seen in backlinks
          - urls: same-TLD page URLs plus resolved internal links
          - backlinks: (url[:249], anchor, dofollow) tuples for off-host links
          - redirects: always empty here; filled elsewhere (kept for the
            method's declared return shape)
        """
        urls = set()
        domains = set()
        backlinks = set()
        redirects = set()

        for link in set(self.html_page.xpath('//a')):
            # NOTE(review): this condition does not depend on `link`, so it
            # either skips every anchor or none -- confirm intent (it could
            # be hoisted above the loop).
            if tldextract.extract(page_domain)[2] != page_tld:
                continue

            # reset to default
            href = url = link_url = link_tld = internal_link = ''

            try:
                href = link.xpath('./@href')[0].lower()
            except IndexError:
                continue

            # Defragment, percent-decode, and strip quotes from the target.
            url = unquote(urldefrag(href)[0]).replace("'", "")

            url = url.split('@')[-1:][0]  # remove mailto part

            if url == page_domain:  # skip this option
                continue

            if len(url) > len(page_tld) + 1:

                link_url = tldextract.extract(url)
                if link_url[1] in ['mailto', 'google']:
                    continue

                link_tld = link_url[2]

                if not link_tld:
                    # url has not tld - it's URI, then create full URL
                    url = url[1:] if url.startswith('./') else url
                    url = url[1:] if url.startswith('//') else url
                    delimiter = '' if url.startswith('/') else '/'
                    internal_link = f"{page_domain}{delimiter}{url}"

                if link_tld == page_tld:
                    # if url not domain:
                    try:
                        if len(url.split(f'.{page_tld}')[1]) > 1:
                            urls.add(url)
                    except IndexError:
                        pass
                    # Off-host links become backlinks with anchor/rel info.
                    if host not in url and url not in host:
                        anchor = await self._get_anchor(link)
                        dofollow = await self._get_rel(link)
                        backlinks.add((url[:249], anchor, dofollow))
                        domain = '.'.join(link_url[1:3])
                        if len(domain) < 64:  # check for maximum domain length
                            domains.add(domain)
                elif internal_link:
                    urls.add(internal_link)
        return domains, urls, backlinks, redirects
Example #38
0
    async def parse_links(self, response):
        """Extract category-refinement links from an Amazon listing page.

        Returns a (FetchStatistic, links) tuple where links is the set of
        allowed, defragmented refinement URLs found in the response.  When
        no refinement links are present, price-band fallback URLs are
        pushed onto the 'price_link_tmp' redis list instead.
        """
        links = set()
        content_type = None
        encoding = None
        price_link = []
        if response.status == 200:
            content_type = response.headers.get('content-type')
            response_url = str(response.url)
            if content_type in ('text/html', 'application/xml',
                                'text/html;charset=UTF-8'):
                pdict = {}
                if content_type:
                    content_type, pdict = cgi.parse_header(content_type)

                encoding = pdict.get('charset', 'utf-8')
                if content_type in ('text/html', 'application/xml'):
                    text = await response.text()
                    # print(text)
                    '''(?i)href=["']([^\s"'<>]+)'''
                    urls = set(
                        re.findall(
                            '<li style="margin-left: [-\d]+px">.*?<a href="(/s/ref=lp_\d+_nr_n_[\d+].*?)">.*?<span class="refinementLink">(.*?)</span>.*?</a>.*?</li>',
                            text, re.S | re.M))

                    if urls:
                        LOGGER.info('got %r distinct urls from %r', len(urls),
                                    response.url)
                    else:
                        # No refinement links: fall back to crawling by
                        # narrow price bands so the category is still covered.
                        for price_g in range(1, 100, 2):
                            low_price = price_g
                            high_price = price_g + 1
                            price_link.append(
                                "{}&low-price={}&high-price={}".format(
                                    response_url, low_price, high_price))

                    if len(price_link) > 0:
                        redis_server.lpush("price_link_tmp", *price_link)

                    for url in urls:
                        u, t = url
                        k = u.replace('&amp;', '&')
                        normalized = urljoin(str(response.url), k)
                        defragmented, frag = urldefrag(normalized)
                        if self.url_allowed(defragmented):
                            print(defragmented, t)
                            ''' Children's Books(儿童图书) General (科学通俗读物) 这两个陷入了回调.
                                INFO:__main__:redirect to 'https://www.amazon.cn/s/ref=lp_2084813051_nr_n_11/460-8646033-3118437?rh=n%3A2084813051&ie=UTF8' from 
                                'https://www.amazon.cn/s/ref=lp_2084813051_nr_n_11/460-8646033-3118437?fst=as%3Aoff&rh=n%3A658390051%2Cn%3A%21658391051%2Cn%3A2045366051%2Cn%3A2078652051%2Cn%3A2084813051%2Cn%3A2084839051&bbn=2084813051&ie=UTF8&qid=1511710241&rnid=2084813051'
                            '''
                            # BUG FIX: the original code did
                            #   LOGGER.info = LOGGER.debug(...)
                            # which assigned the None returned by debug() to
                            # LOGGER.info, destroying the logger's info()
                            # method after the first matched link.
                            LOGGER.debug(
                                'previous url: %s, next url: %s, title: %s',
                                str(response.url), defragmented, t)
                            if t == "General (科学通俗读物)":
                                LOGGER.error("错误的分类: %r", t)
                            else:
                                links.add(defragmented)

        stat = FetchStatistic(url=response.url)
        return stat, links
Example #39
0
def splitDecodeFragment(url):
    """Split *url* into (base, percent-decoded fragment) unicode strings.

    Accepts None (normalized to a pair of empty strings) for robustness
    and consistency with the sibling implementations of this helper.
    """
    if url is None: # urldefrag raises/misbehaves for None; normalize instead
        return _STR_UNICODE(""), _STR_UNICODE("")
    urlPart, fragPart = urldefrag(url)
    if isPy3:
        return (urlPart, unquote(fragPart, "utf-8", errors=None))
    else:
        return _STR_UNICODE(urlPart), unquote(_STR_UNICODE(fragPart),
                                              "utf-8",
                                              errors=None)
Example #40
0
def defragment_and_absolute(current_url, new_url):
    """Return *new_url* resolved against *current_url*, minus any fragment."""
    if not urlparse(new_url).netloc:
        # Relative link: make it absolute against the page it appeared on.
        new_url = urljoin(current_url, new_url)
    return urldefrag(new_url).url
Example #41
0
def splitDecodeFragment(url):
    """Split *url* into (base, percent-decoded fragment) unicode strings."""
    # urldefrag returns byte strings for none, instead of unicode strings
    if url is None:
        return _STR_UNICODE(""), _STR_UNICODE("")
    base, fragment = urldefrag(url)
    if not isPy3:
        return _STR_UNICODE(base), unquote(
            _STR_UNICODE(fragment), "utf-8", errors=None)
    return base, unquote(fragment, "utf-8", errors=None)
Example #42
0
def validate(uri):
    """Return the defragmented URI when it has a scheme and netloc, else False."""
    # Anything after the first space is noise; keep only the first token.
    candidate = uri.split(' ', 1)[0]
    parts = urlparse(candidate)
    if not (parts.scheme and parts.netloc):
        return False
    return urldefrag(candidate)[0]
Example #43
0
def _get_repo_remote(url):
    '''
    For a given git url, return the base url and branch, if present (otherwise
    return 'master').
    '''
    base_url, branch = urldefrag(url)
    branch = branch or 'master'
    return base_url, branch
Example #44
0
def _meta_schemas():
    """
    Collect the urls and meta schemas from each known validator.

    """

    return {
        urlparse.urldefrag(schema["id"])[0]: schema
        for schema in (v.META_SCHEMA for v in validators.values())
    }
Example #45
0
def _meta_schemas():
    """
    Map each known validator's meta-schema id (minus fragment) to the schema.

    """
    pairs = []
    for validator in validators.values():
        schema = validator.META_SCHEMA
        pairs.append((urlparse.urldefrag(schema["id"])[0], schema))
    return dict(pairs)
Example #46
0
def splitDecodeFragment(url):
    """Split a URL into its base and percent-decoded fragment parts."""
    if url is None:
        # urldefrag returns byte strings for none, instead of unicode strings
        empty = _STR_UNICODE("")
        return empty, empty
    urlPart, fragPart = urldefrag(url)
    if isPy3:
        return urlPart, unquote(fragPart, "utf-8", errors=None)
    return _STR_UNICODE(urlPart), unquote(
        _STR_UNICODE(fragPart), "utf-8", errors=None)
Example #47
0
def test_spider(client, app, check_external_links):
    """Check that all links work

    Spiders the site, making sure all internal links point to existing pages.
    Includes fragments: any #hash in a link must correspond to existing element
    with id.

    If check_external_links is true, checks external links as well.
    """
    to_visit = {'http://localhost/'}
    visited = set()
    external = set()

    # For each defragmented URL, the set of fragments linked to on it,
    # and the element ids actually found on the rendered page.
    wanted_fragments = collections.defaultdict(set)
    page_ids = {}

    # Wrap url_for so URLs generated while rendering templates are also
    # queued for crawling.
    def recording_url_for(*args, **kwargs):
        url = flask.url_for(*args, **kwargs)
        if url not in visited:
            to_visit.add(urljoin('http://localhost/', url))
        return url

    app.jinja_env.globals['url_for'] = recording_url_for

    while to_visit:
        url = to_visit.pop()
        if url in visited:
            continue
        visited.add(url)
        links = []
        parsed = urlparse(url)
        if parsed.netloc == 'localhost':
            print('visit', url)
            page_ids[url] = []
            # check_url fills `links` and `page_ids[url]` as side effects.
            check_url(client, url, links, page_ids[url])
            for link in links:
                fullurl = urljoin('http://localhost/', url)
                fullurl = urljoin(fullurl, link)
                result = urldefrag(fullurl)
                defrag = result.url
                fragment = result.fragment
                if fragment:
                    wanted_fragments[defrag].add(fragment)
                if defrag not in visited:
                    to_visit.add(defrag)
        else:
            # Off-host URLs are collected for the optional external check.
            if parsed.scheme in ('http', 'https'):
                external.add(url)
            else:
                print('ignore', url)

    # Every fragment linked to must exist as an element id on its page.
    for url, fragments in wanted_fragments.items():
        assert fragments <= set(page_ids[url])

    if check_external_links:
        for url in external:
            print('check', url)
            check_external_link(url)
Example #48
0
 def absolutize(self, uri, defrag=1):
     """Resolve *uri* against a file: URL for the current working directory.

     With *defrag* true (the default) the fragment is stripped; otherwise
     a trailing '#' on the input is preserved on the result.
     """
     cwd_base = urljoin("file:", pathname2url(os.getcwd()))
     resolved = urljoin(cwd_base + "/", uri, allow_fragments=not defrag)
     if defrag:
         resolved = urldefrag(resolved)[0]
     elif uri and uri.endswith("#") and not resolved.endswith("#"):
         resolved = resolved + "#"
     return URIRef(resolved)
def html_to_lxml(url, text, clean=False):
    """Parse plain-text HTML into an ``lxml`` document tree.

    When *clean* is true the markup is first normalised through pandoc.
    Links whose defragmented target equals *url* itself are rewritten to
    empty strings, preventing endless self-referential loops.
    """
    if clean:
        text = _text_from_sp(('pandoc', '--from=html', '--to=html5'),
                             text.encode())
    document = lxml.html.document_fromstring(text)

    def drop_self_link(link):
        # A link back to the page itself would loop endlessly.
        return '' if urldefrag(link).url == url else link

    document.rewrite_links(drop_self_link, base_href=url)
    return document
Example #50
0
def check_url(url):
    """Check the given URL by issuing a HEAD request."""
    # Fragments are client-side only; never send them to the server.
    url, _fragment = urldefrag(url)

    head_request = Request(url)
    head_request.get_method = lambda: 'HEAD'
    head_request.add_header('User-Agent', USER_AGENT)

    return urlopen(head_request)
Example #51
0
def crawler(startpage, maxpages=100, singledomain=True):
    """Crawl the web starting from specified page.

    1st parameter = starting page url
    maxpages = maximum number of pages to crawl
    singledomain = whether to only crawl links within startpage's domain
    """
    import requests, re, bs4
    from urllib.parse import urldefrag, urljoin, urlparse
    from collections import deque

    pagequeue = deque() # queue of pages to be crawled
    pagequeue.append(startpage)
    crawled = [] # list of pages already crawled
    domain = urlparse(startpage).netloc # for singledomain option

    pages = 0 # number of pages successfully crawled so far
    failed = 0 # number of pages that couldn't be crawled

    while pages < maxpages and pagequeue:
        url = pagequeue.popleft() # get next page to crawl (FIFO queue)
        try:
            response = requests.get(url)
            if not response.headers['content-type'].startswith('text/html'):
                continue # don't crawl non-HTML links
            soup = bs4.BeautifulSoup(response.text, "html.parser")
            print('Crawling:', url)
            pages += 1
            crawled.append(url)

            # PROCESSING CODE GOES HERE:
            # do something interesting with this page

            # get target URLs for all links on the page
            links = [a.attrs.get('href') for a in soup.select('a[href]')]
            # remove fragment identifiers
            links = [urldefrag(link)[0] for link in links]
            # remove any empty strings
            links = list(filter(None,links))
            # if it's a relative link, change to absolute
            links = [link if bool(urlparse(link).netloc) else urljoin(url,link) for link in links]
            # if singledomain=True, remove links to other domains
            if singledomain:
                links = [link for link in links if (urlparse(link).netloc == domain)]

            # add these links to the queue (except if already crawled)
            for link in links:
                if link not in crawled and link not in pagequeue:
                    pagequeue.append(link)
        except Exception:
            # Narrowed from a bare `except:` so Ctrl-C / SystemExit still
            # interrupt the crawl; any request or parse error just counts
            # the page as failed.
            print("*FAILED*:", url)
            failed += 1

    print('{0} pages crawled, {1} pages failed to load.'.format(pages, failed))
Example #52
0
 def send(self, request, stream=False, timeout=None, verify=True, cert=None, proxies=None):
     """Serve *request* from this adapter's table of canned virtual URLs."""
     url, _fragment = urldefrag(request.url)
     # Count every request made against each virtual URL.
     self.requests.update({url: 1})
     if url not in self.urls:
         raise ConnectionError("no such virtual url", request.url)
     response = self.build_response(request, Resp(**self.urls[url]))
     if not stream:
         # force prefetching content unless streaming in use
         response.content
     return response
Example #53
0
 def extract_links(self, html):
     """Return all defragmented, absolute anchor targets found in *html*."""
     log.debug("extract_links")
     parsed = BeautifulSoup(html, "lxml")
     found = []
     for anchor in parsed.find_all('a', href=True):
         # Resolve against the seed URL and drop any fragment.
         absolute = urljoin(self.seed, anchor.get('href'))
         found.append(urldefrag(absolute)[0])
     log.info("found {}".format(len(found)))
     return found
Example #54
0
def filter_links(link_list, base_url):
    """Normalize anchor elements to a deduplicated list of report URLs."""
    hrefs = [element.get("href") for element in link_list]
    # Expand go.usa.gov short links to their final destinations.
    hrefs = [
        utils.resolve_redirect(href)
        if href.startswith("http://go.usa.gov/") else href
        for href in hrefs
    ]
    # Absolutize against the page URL and strip fragments.
    hrefs = [urldefrag(urljoin(base_url, href))[0] for href in hrefs]
    kept = [
        href for href in hrefs
        if href and href not in BLACKLIST_REPORT_URLS and not href.startswith("mailto:")
    ]
    return list(set(kept))
Example #55
0
    def annotation(self):
        """Render launch data (Via URL + extension URL) for one annotation.

        Looks the annotation up in Elasticsearch by the 'id' matchdict
        parameter.  Raises 404 when it does not exist and 422 when it
        cannot be parsed or targets a non-HTTP(S) document.
        """
        settings = self.request.registry.settings

        try:
            document = util.elasticsearch_client(settings).get(
                index=settings["elasticsearch_index"],
                doc_type="annotation",
                id=self.request.matchdict["id"])
        except exceptions.NotFoundError:
            statsd.incr("views.annotation.404.annotation_not_found")
            raise httpexceptions.HTTPNotFound(_("Annotation not found"))

        try:
            annotation_id, document_uri = util.parse_document(document)
        except util.InvalidAnnotationError as exc:
            statsd.incr("views.annotation.422.{}".format(exc.reason))
            raise httpexceptions.HTTPUnprocessableEntity(str(exc))

        # Remove any existing #fragment identifier from the URI before we
        # append our own.
        document_uri = parse.urldefrag(document_uri)[0]

        if not (document_uri.startswith("http://") or
                document_uri.startswith("https://")):
            statsd.incr("views.annotation.422.not_an_http_or_https_document")
            raise httpexceptions.HTTPUnprocessableEntity(
                _("Sorry, but it looks like this annotation was made on a "
                  "document that is not publicly available."))

        # Proxied view of the document with the annotation highlighted.
        via_url = "{via_base_url}/{uri}#annotations:{id}".format(
            via_base_url=settings["via_base_url"],
            uri=document_uri,
            id=annotation_id)

        # Direct link for users who have the browser extension installed.
        extension_url = "{uri}#annotations:{id}".format(
            uri=document_uri, id=annotation_id)

        # Truncate long hostnames for display, appending an ellipsis.
        parsed_url = parse.urlparse(document_uri)
        pretty_url = parsed_url.netloc[:NETLOC_MAX_LENGTH]
        if len(parsed_url.netloc) > NETLOC_MAX_LENGTH:
          pretty_url = pretty_url + jinja2.Markup("&hellip;")

        statsd.incr("views.annotation.200.annotation_found")
        return {
            "data": json.dumps({
                # Warning: variable names change from python_style to
                # javaScriptStyle here!
                "chromeExtensionId": settings["chrome_extension_id"],
                "viaUrl": via_url,
                "extensionUrl": extension_url,
            }),
            "pretty_url": pretty_url
        }
Example #56
0
 def get_urls(self, document):
     """Collect in-scope links from *document*.

     Returns (urls, urls_to_parse): all links under self.base, and the
     subset containing self.capture that should be parsed further.
     """
     urls = []
     urls_to_parse = []
     dom = html.fromstring(document)
     for href in dom.xpath('//a/@href'):
         # Skip links matching any excluded pattern.
         if any(pattern in href for pattern in self.exclude):
             continue
         absolute = urljoin(self.base, urldefrag(href)[0])
         if not absolute.startswith(self.base):
             continue
         if self.capture in absolute:
             urls_to_parse.append(absolute)
         urls.append(absolute)
     return urls, urls_to_parse
Example #57
0
    def fix_toc_entry(self, toc):
        """Rewrite a TOC entry's href via the rename map, recursing into children."""
        if toc.href:
            normalized = urlnormalize(toc.href)
            base, fragment = urldefrag(normalized)
            renamed = self.rename_map.get(base, None)

            if renamed is not None:
                # Re-attach the fragment to the renamed target, if any.
                toc.href = '#'.join((renamed, fragment)) if fragment else renamed

        for child in toc:
            self.fix_toc_entry(child)
Example #58
0
    def get_out_going_links(self, page, html_body):
        """extracts all the outgoing links and adds links that belong to
           main page domain for further crawling if they are not crawled yet
           This avoids:
            - links that are .zip files
            - links mentioned in href that are javascript methods
            - mailto: links

           Returns the list of cleaned outgoing hrefs found on the page;
           in-domain, robots-allowed, non-binary links are also added to
           self.unvisited as a side effect.
        """
        soup = BeautifulSoup(html_body, "html.parser")
        valid_links_for_this_page = []
        for a in soup.find_all('a', href=True):

            href = a['href'].lower()
            href = self.compose_url_from_href(page, href)

            # clean the href so that it will have legitimate urls instead of #cluttered ones and q=param prints
            href = urldefrag(href)[0]  # skip intra links [this took time to find out !] ##1
            # remove query params as only the path matters
            if href.find('?') != -1:
                href = href[:href.find('?')]  ##2

            new_page = urlparse(href)

            # add to the queue only if it doesn't cause a cycle
            # assumption: if a link ends with domain.com, assuming it can be crawled to make sitemap complete
            if  not str(new_page.netloc).endswith(self.start_page):          # doesn't belong to domain
                valid_links_for_this_page.append(href)
                continue

            # Only queue in-domain links that pass robots.txt, haven't been
            # seen, and aren't javascript:/mailto: pseudo-links.
            if  self.robot_allows(href) and \
                not href in self.site_map            and \
                not href in self.unvisited                  and \
                not 'javascript:' in href           and \
                not 'mailto:' in href:
                # Binary/archive/image downloads are listed but not crawled.
                if not ( href.endswith(".zip") or
                             href.endswith(".gz") or
                             href.endswith(".gzip") or
                             href.endswith(".tar") or
                             href.endswith(".bz2") or
                             href.endswith(".jpg") or
                             href.endswith(".png") or
                             href.endswith(".exe")
                         ):
                    self.unvisited.add(href)
                valid_links_for_this_page.append(href)

        return valid_links_for_this_page
Example #59
0
def crawl(base_url, follow_external_links=True, ignore_fragments=True,
          verify=True):
    """Crawl the site rooted at *base_url*, yielding every Response fetched.

    Starting from *base_url*, fetch each URL, yield the response, and — for
    HTML and CSS responses served from the starting host — extract further
    URLs to visit. URLs are deduplicated via ``seen``; traversal order is
    LIFO (depth-first-ish) because ``todo`` is used as a stack.

    Parameters:
        base_url: the URL to start crawling from; its netloc defines
            which pages are parsed for more links.
        follow_external_links: if False, URLs on other hosts are never
            enqueued (they are still yielded if already enqueued).
        ignore_fragments: if True, strip ``#fragment`` parts before
            deduplication/enqueueing, so intra-page anchors don't cause
            duplicate fetches.
        verify: passed through to ``requests`` TLS certificate verification.

    Yields:
        ``requests.Response`` objects, one per fetched URL.
    """
    base_netloc = urlparse(base_url).netloc

    seen = set([base_url])
    todo = [base_url]

    session = requests.Session()
    session.verify = verify

    while todo:
        url = todo.pop()

        try:
            rsp = session.get(url)
        except requests.exceptions.InvalidSchema:
            # TODO: Check if the scheme is a valid one, or otherwise
            # communicate the error to the user.
            continue

        yield rsp

        # Only pages on the starting host are parsed for further links.
        if urlparse(url).netloc != base_netloc:
            continue

        # Fix: a response with no Content-Type header used to raise
        # KeyError here; default to '' so it is skipped like any other
        # non-parsable content type.
        content_type, _ = cgi.parse_header(
            rsp.headers.get('content-type', ''))

        if content_type == 'text/html':
            urls = extract_urls_from_html(rsp.text)
        elif content_type == 'text/css':
            urls = extract_urls_from_css(rsp.text)
        else:
            # see https://bitbucket.org/ned/coveragepy/issues/497/
            continue  # pragma: no cover

        for url1 in urls:
            # Resolve relative references against the page they came from.
            abs_url = urljoin(url, url1)

            if ignore_fragments:
                abs_url = urldefrag(abs_url)[0]

            if not follow_external_links:
                if urlparse(abs_url).netloc != base_netloc:
                    continue

            if abs_url not in seen:
                seen.add(abs_url)
                todo.append(abs_url)
# Example #60 (score: 0)
 def find_internal_links(route):
   """
   Given a route, returns (an estimate of) all routes that
   it links to in any manner (href or resource).  Won't find links
   that are neither in an "href" nor marked as a rewritable-resource link
   (erring on the side of not finding links since finding links risks
   half-private pages being Google-indexable, while finding no links
   in weird cases just means weird enough things won't be Google-indexable
   unless I explicitly mark them into that list).
   """
   result = set()
   f = route_metadata[route].file
   # TODO if we except some HTML files from rewriting then this
   # will be wrong:
   if f not in files_to_rewrite:
     return result
   contents = utils.read_file_binary(join('site', f))
   for href in re.finditer(
       br'(?<!'+urlregexps.urlbyte+br')(?:'+
       # Things that look like href= that are not a link to a page:
       # example text that talks about hrefs; <base href="...">.
       # Only <a>, <area> and possibly <link> elements can have
       # hrefs we're interested in.  By excluding not-actually-links,
       # we can have broken-link detection without false positives
       # (or few enough false positives that it's trivial to work around).
       br'''<(?:[Aa]|[Aa][Rr][Ee][Aa]|[Ll][Ii][Nn][Kk])\s[^<>]*href=(?P<quote>["']?)(?P<url1>'''+urlregexps.urlbyte+br'''+)(?<!\?rr)(?P=quote)'''+
       br'''|(?P<url2>'''+urlregexps.urlbyte+br'''+)\?rr'''+
       br')(?!'+urlregexps.urlbyte+br')'
       ,
       contents):
     url = href.group('url1') or href.group('url2')
     linktype = 'rr' if href.group('url2') != None else 'href'
     ref = html.unescape(url.decode('utf-8'))
     if linktype == 'rr':
       path = config.fake_resource_route+normpath(join(dirname(f), ref))
     elif linktype == 'href':
       path = urldefrag(urljoin(route, ref))[0]
     if path in route_metadata:
       result.add(path)
     else:
       # (we don't currently try to check links to third-party websites)
       if path[:len(config.hypothetical_scheme_and_domain)] == config.hypothetical_scheme_and_domain:
         sys.stderr.write(route + ' links to nonexistent ' + ref + '\n')
         nonlocal broken_link_found
         broken_link_found = True
   return result