def _build_adjacency_graph(self, entries, domains):
    """
    Constructs the adjacency graph for outgoing links and saves it to
    self.adj_graph.

    TODO: move the ES entries/documents parsing into a separate function?

    :param entries: An iterable that contains the ES entries.
        Preferably a generator to reduce RAM usage.
    :param domains: If non-empty, inspect only the links that point to
        webpages under these domains.
    :return: adjacency graph of (origin_idx, destiny_idx) pairs
    :rtype: ``set`` of tuples
    """
    entries = entries or self.entries
    adj_graph = []
    for e in entries:
        if '_source' in e:
            # if called by `calc_page_pop.py` then `e` is an ES document
            e = e['_source']

        if 'domain' in e:  # entry from a crawled page
            origin = e['domain']
        elif 'source' in e:  # entry from anchor text
            origin = e['source']
        else:
            logger.warning('rank_pages: Unable to process: %s' % e)
            continue
        origin = utils.extract_domain_from_url(origin)

        if is_valid_onion(origin):
            origin_idx = self._get_domain_idx(origin)
            links = e.get('links', [])  # crawled case
            if 'target' in e:  # anchor case
                links.append({'link': e['target']})
            for l in links:
                # ignore SE-spam links
                if is_legit_link(l, e):
                    url = l['link']
                    if is_valid_full_onion_url(url):
                        destiny = utils.extract_domain_from_url(url)
                        # if domains is non-empty, ignore any other destinations
                        if not domains or destiny in domains:
                            if destiny != origin:
                                destiny_idx = self._get_domain_idx(destiny)
                                adj_graph.append((origin_idx, destiny_idx))

    self.num_links = len(adj_graph)  # total links
    adj_graph = set(adj_graph)  # count each origin->destiny edge only once
    self.num_edges = len(adj_graph)  # unique links
    return adj_graph
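# Hypothetical usage sketch for the method above: the two entry shapes it
# accepts (a crawled-page entry and an anchor-text entry). The `ranker`
# instance and the onion addresses below are made-up assumptions, shown only
# to illustrate the expected key layout, not taken from the original module.
crawled_entry = {
    'domain': 'exampleaaaaaaaaa.onion',
    'links': [{'link': 'http://examplebbbbbbbbb.onion/page.html'}],
}
anchor_entry = {
    'source': 'http://exampleccccccccc.onion/',
    'target': 'http://exampleaaaaaaaaa.onion/',
}
edges = ranker._build_adjacency_graph(
    entries=[crawled_entry, anchor_entry],
    domains=[],  # empty: do not restrict destination domains
)
# `edges` is a set of (origin_idx, destiny_idx) index tuples.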
def onion_redirect(request):
    """Add clicked information and redirect to .onion address."""
    redirect_url = request.GET.get('redirect_url', '')
    search_term = request.GET.get('search_term', '')

    # Check for "malicious" URI schemes that could lead to XSS.
    # If one is detected, reject the request with a 400 Bad Request.
    if not xss_safe(redirect_url):
        answer = "Bad request: undefined URI-scheme provided"
        return HttpResponseBadRequest(answer)

    if not redirect_url or not search_term:
        answer = "Bad request: no GET parameter URL."
        return HttpResponseBadRequest(answer)

    try:
        onion = utils.extract_domain_from_url(redirect_url)
        if is_valid_full_onion_url(redirect_url):
            # Currently we can't log i2p clicks because
            # SearchResultsClick.onion_domain has an onion validator.
            # We also don't have i2p results yet in order to test it.
            SearchResultsClick.objects.add_or_increment(
                onion_domain=onion,
                clicked=redirect_url,
                search_term=search_term)
    except Exception as error:
        logger.error("Error with redirect URL: {0}\n{1}".format(
            redirect_url, error))

    message = "Redirecting to hidden service."
    return redirect_page(message, 0, redirect_url)
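# The xss_safe() helper used above is not shown in this section. A minimal
# sketch of the kind of check it might perform, assuming the goal is to reject
# URI schemes that could execute script on redirect (an assumption, not the
# project's actual implementation):
from urllib.parse import urlparse

MALICIOUS_SCHEMES = ('javascript', 'data', 'vbscript')

def xss_safe_sketch(url):
    """Return False if the URL uses a scheme that could lead to XSS."""
    scheme = urlparse(url.strip()).scheme.lower()
    return scheme not in MALICIOUS_SCHEMES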
def onion_redirect(request):
    """Add clicked information and redirect to .onion address."""
    redirect_url = request.GET.get('redirect_url', '')
    search_term = request.GET.get('search_term', '')

    if not redirect_url or not search_term:
        answer = "Bad request: no GET parameter URL."
        return HttpResponseBadRequest(answer)

    try:
        onion = utils.extract_domain_from_url(redirect_url)
        if is_valid_full_onion_url(redirect_url):
            # Currently we can't log i2p clicks because
            # SearchResultsClick.onion_domain has an onion validator.
            # We also don't have i2p results yet in order to test it.
            SearchResultsClick.objects.add_or_increment(
                onion_domain=onion,
                clicked=redirect_url,
                search_term=search_term)
    except Exception as error:
        logger.error("Error with redirect URL: {0}\n{1}".format(
            redirect_url, error))

    message = "Redirecting to hidden service."
    return redirect_page(message, 0, redirect_url)
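# SearchResultsClick.objects.add_or_increment() is referenced above but not
# defined in this section. A hedged sketch of what such a custom manager
# method could look like (the `clicks` counter field is an assumption):
from django.db import models

class SearchResultsClickManager(models.Manager):
    def add_or_increment(self, **kwargs):
        # Create the row on the first click, otherwise bump its counter.
        obj, created = self.get_or_create(defaults={'clicks': 1}, **kwargs)
        if not created:
            obj.clicks += 1
            obj.save(update_fields=['clicks'])
        return obj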
def build_adjacency_graph(self, documents):
    """
    Constructs the adjacency graph for outgoing links and saves it to
    self.adj_graph.

    :param documents: An iterable that contains the ES documents.
        Preferably a generator to reduce RAM usage.
    :return: adjacency graph of (origin_idx, destiny_idx) pairs
    :rtype: ``set`` of tuples
    """
    documents = documents or self.documents
    adj_graph = []
    for doc in documents:
        source = doc['_source']
        if 'domain' in source:
            origin = source['domain']
        elif 'source' in source:
            origin = source['source']
        else:
            logger.info('rank_pages: Unable to process: %s' % source)
            continue
        origin = utils.extract_domain_from_url(origin)

        if is_valid_onion(origin):
            origin_idx = self._get_domain_idx(origin)
            links = source.get('links', [])  # crawled case
            if 'target' in source:  # anchor case
                links.append({'link': source['target']})
            for l in links:
                url = l['link']
                if is_valid_full_onion_url(url):
                    destiny = utils.extract_domain_from_url(url)
                    if destiny != origin:
                        destiny_idx = self._get_domain_idx(destiny)
                        adj_graph.append((origin_idx, destiny_idx))

    self.num_links = len(adj_graph)  # total links
    adj_graph = set(adj_graph)  # count each source->destiny edge only once
    self.num_edges = len(adj_graph)  # unique links
    return adj_graph
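# Illustrative follow-up (not part of the original ranking code): one way the
# (origin_idx, destiny_idx) edge set returned above could feed a PageRank-style
# power iteration. Dangling domains (no out-links) are ignored for brevity.
def pagerank_sketch(adj_graph, num_domains, damping=0.85, iterations=50):
    out_links = {}
    for src, dst in adj_graph:
        out_links.setdefault(src, []).append(dst)

    rank = [1.0 / num_domains] * num_domains
    for _ in range(iterations):
        new_rank = [(1.0 - damping) / num_domains] * num_domains
        for src, targets in out_links.items():
            share = damping * rank[src] / len(targets)
            for dst in targets:
                new_rank[dst] += share
        rank = new_rank
    return rank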