Example 1
    def _build_adjacency_graph(self, entries, domains):
        """
        Constructs adjacency graph for outgoing links, saves to self.adj_graph

        TODO: move ES entry/document parsing into a separate function?

        :param entries: An iterable that contains the ES entries.
            Preferably a generator to reduce RAM usage.
        :param domains: If non-empty then inspect only the links that point
            to webpages under these domains
        :return: the adjacency graph
        :rtype: ``set`` of ``(origin_idx, destiny_idx)`` tuples
        """
        entries = entries or self.entries
        adj_graph = []

        for e in entries:
            if '_source' in e:
                # if called by `calc_page_pop.py` then `e` is an ES document
                e = e['_source']

            if 'domain' in e:
                # entry from crawled page
                origin = e['domain']
            elif 'source' in e:
                # entry from anchor text
                origin = e['source']
            else:
                logger.warning('rank_pages: Unable to process: %s', e)
                continue

            origin = utils.extract_domain_from_url(origin)
            if is_valid_onion(origin):
                origin_idx = self._get_domain_idx(origin)

                links = e.get('links', [])  # crawled case
                if 'target' in e:  # anchor case
                    links.append({'link': e['target']})

                for link in links:
                    # ignore SE-spam links
                    if is_legit_link(link, e):
                        url = link['link']
                        if is_valid_full_onion_url(url):
                            destiny = utils.extract_domain_from_url(url)
                            # if domains is non-empty, skip links to any other destinations
                            if not domains or destiny in domains:
                                if destiny != origin:
                                    destiny_idx = self._get_domain_idx(destiny)
                                    adj_graph.append((origin_idx, destiny_idx))

        self.num_links = len(adj_graph)  # total links
        adj_graph = set(adj_graph)  # count only 1 edge of source->destiny
        self.num_edges = len(adj_graph)  # unique links

        return adj_graph
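
The method above relies on a ``_get_domain_idx`` helper that maps each domain to a stable integer index; the snippet does not show it. A minimal sketch of what such a helper might look like, where the ``domain_idx`` dict is an assumption rather than the project's actual attribute:

    def _get_domain_idx(self, domain):
        """Return a stable integer index for a domain, assigning one on first use."""
        # self.domain_idx is assumed to be a dict initialized in __init__
        if domain not in self.domain_idx:
            self.domain_idx[domain] = len(self.domain_idx)
        return self.domain_idx[domain]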
Example 2
def onion_redirect(request):
    """Add clicked information and redirect to .onion address."""

    redirect_url = request.GET.get('redirect_url', '')
    search_term = request.GET.get('search_term', '')

    # Check for "malicious" URI schemes that could lead to XSS.
    # If the scheme is unsafe, reject the request with a 400 response.
    if not xss_safe(redirect_url):
        answer = "Bad request: undefined URI-scheme provided"
        return HttpResponseBadRequest(answer)

    if not redirect_url or not search_term:
        answer = "Bad request: no GET parameter URL."
        return HttpResponseBadRequest(answer)

    try:
        onion = utils.extract_domain_from_url(redirect_url)
        if is_valid_full_onion_url(redirect_url):
            # Currently we can't log i2p clicks because
            # SearchResultsClick.onion_domain has an onion validator.
            # Also, we don't have i2p results yet to test this with.
            SearchResultsClick.objects.add_or_increment(
                onion_domain=onion,
                clicked=redirect_url,
                search_term=search_term)
    except Exception as error:
        logger.error("Error with redirect URL: {0}\n{1}".format(
            redirect_url, error))

    message = "Redirecting to hidden service."
    return redirect_page(message, 0, redirect_url)
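
The view above delegates scheme filtering to an ``xss_safe`` helper that the snippet does not define. A minimal sketch of such a check, assuming it only needs to reject URI schemes that can run script in the browser (the scheme list is an assumption):

    def xss_safe(url):
        """Reject URLs whose scheme could execute script in the browser."""
        # Drop whitespace/control characters that browsers ignore when
        # parsing schemes, then compare case-insensitively (assumed policy).
        normalized = ''.join(url.split()).lower()
        return not normalized.startswith(('javascript:', 'data:', 'vbscript:'))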
Example 3
def onion_redirect(request):
    """Add clicked information and redirect to .onion address."""

    redirect_url = request.GET.get('redirect_url', '')
    search_term = request.GET.get('search_term', '')

    if not redirect_url or not search_term:
        answer = "Bad request: no GET parameter URL."
        return HttpResponseBadRequest(answer)

    try:
        onion = utils.extract_domain_from_url(redirect_url)
        if is_valid_full_onion_url(redirect_url):
            # Currently we can't log i2p clicks because
            # SearchResultsClick.onion_domain has an onion validator.
            # Also, we don't have i2p results yet to test this with.
            SearchResultsClick.objects.add_or_increment(
                onion_domain=onion,
                clicked=redirect_url,
                search_term=search_term)
    except Exception as error:
        logger.error("Error with redirect URL: {0}\n{1}".format(
            redirect_url, error))

    message = "Redirecting to hidden service."
    return redirect_page(message, 0, redirect_url)
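
For context, a view like this is typically wired into Django's URL configuration. A minimal sketch assuming standard Django routing; the path and route name below are illustrative, not taken from the project:

    # urls.py (illustrative)
    from django.urls import path
    from . import views

    urlpatterns = [
        path('redirect/', views.onion_redirect, name='onion_redirect'),
    ]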
Example 4
    def build_adjacency_graph(self, documents):
        """
        Constructs adjacency graph for outgoing links, saves to self.adj_graph

        :param documents: An iterable that contains the ES documents.
            Preferably a generator to reduce RAM usage.
        :return: the adjacency graph
        :rtype: ``set`` of ``(origin_idx, destiny_idx)`` tuples
        """
        documents = documents or self.documents
        adj_graph = []

        for doc in documents:
            source = doc['_source']

            if 'domain' in source:
                origin = source['domain']
            elif 'source' in source:
                origin = source['source']
            else:
                logger.info('rank_pages: Unable to process: %s', source)
                continue

            origin = utils.extract_domain_from_url(origin)
            if is_valid_onion(origin):
                origin_idx = self._get_domain_idx(origin)

                links = source.get('links', [])  # crawled case
                if 'target' in source:  # anchor case
                    links.append({'link': source['target']})
                for link in links:
                    url = link['link']
                    if is_valid_full_onion_url(url):
                        destiny = utils.extract_domain_from_url(url)
                        if destiny != origin:
                            destiny_idx = self._get_domain_idx(destiny)
                            adj_graph.append((origin_idx, destiny_idx))

        self.num_links = len(adj_graph)  # total links
        adj_graph = set(adj_graph)  # count only 1 edge of source->destiny
        self.num_edges = len(adj_graph)  # unique links

        return adj_graph
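
These methods return a set of ``(origin_idx, destiny_idx)`` edge pairs rather than a matrix proper. For a PageRank-style computation those pairs would typically be packed into a sparse matrix. A minimal sketch assuming SciPy is available; the function name and ``num_domains`` parameter are illustrative:

    import numpy as np
    from scipy.sparse import csr_matrix

    def edges_to_matrix(adj_graph, num_domains):
        """Build a sparse 0/1 adjacency matrix from (origin, destiny) index pairs."""
        if not adj_graph:
            return csr_matrix((num_domains, num_domains))
        rows, cols = zip(*adj_graph)
        data = np.ones(len(rows))
        return csr_matrix((data, (rows, cols)), shape=(num_domains, num_domains))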