def search(persona):
    """Run every query/term combination of the persona and tally the links found.

    Each query template is formatted with each term, and the resulting URL is
    handed to ``HtmlParser.get_text_links``. A progress line is printed after
    every search.

    Args:
        persona: Mapping whose optional ``'search'`` entry supplies the
            ``'queries'`` (format templates) and ``'terms'`` lists. Missing
            entries are treated as empty lists.

    Returns:
        None. NOTE(review): the distinct links are only counted for the
        progress output; they are not returned or stored anywhere.
    """
    settings = persona.get('search', {})
    queries = settings.get('queries', [])
    terms = settings.get('terms', [])
    print("search: {} {}x queries {}x terms".format(
        PERSONA, len(queries), len(terms)))

    total = len(queries) * len(terms)
    urlsToVisit = set()
    parser = HtmlParser()
    # Build every search URL up front so a malformed template fails before
    # any page is requested.
    searchUrls = [query.format(term) for query in queries for term in terms]
    for searches, url in enumerate(searchUrls, start=1):
        _text, links = parser.get_text_links(url)
        # A set ignores duplicates on its own; no membership test is needed.
        urlsToVisit.update(links)
        print("{} / {} {} {}x {}x".format(
            searches, total, printable(url), len(links), len(urlsToVisit)))
def search(persona):
    """Run every query/term combination of the persona and tally the links found.

    Each query template is formatted with each term, and the resulting URL is
    handed to ``HtmlParser.get_text_links``. A progress line is printed after
    every search.

    Args:
        persona: Mapping whose optional ``'search'`` entry supplies the
            ``'queries'`` (format templates) and ``'terms'`` lists. Missing
            entries are treated as empty lists.

    Returns:
        None. NOTE(review): the distinct links are only counted for the
        progress output; they are not returned or stored anywhere.
    """
    settings = persona.get('search', {})
    queries = settings.get('queries', [])
    terms = settings.get('terms', [])
    print("search: {} {}x queries {}x terms".format(PERSONA, len(queries), len(terms)))

    total = len(queries) * len(terms)
    urlsToVisit = set()
    parser = HtmlParser()
    # Build every search URL up front so a malformed template fails before
    # any page is requested.
    searchUrls = [query.format(term) for query in queries for term in terms]
    for searches, url in enumerate(searchUrls, start=1):
        _text, links = parser.get_text_links(url)
        # A set ignores duplicates on its own; no membership test is needed.
        urlsToVisit.update(links)
        print("{} / {} {} {}x {}x".format(searches, total, printable(url), len(links), len(urlsToVisit)))
def browse(persona, maxVisits=None, direction=None):
    """Crawl outward from the persona's seed URLs, one page per step.

    Each URL taken from the queue is handed to ``HtmlParser.get_text_links``.
    Links found on the page are stripped of query string and fragment and,
    if not seen before, queued according to ``direction``. A page whose text
    contains any of the persona's ``'blacks'`` terms (case-insensitive)
    contributes no links. A progress line is printed whenever the number of
    visited URLs is a multiple of 100.

    Args:
        persona: Mapping whose optional ``'browse'`` entry supplies the
            ``'seeds'``, ``'whites'`` and ``'blacks'`` lists. Only the length
            of ``'whites'`` is used here (in the start-up message).
        maxVisits: Stop once this many distinct URLs have been visited;
            ``None`` means no limit.
        direction: Queueing strategy for new links. ``'breath-first'`` (the
            historical spelling) or ``'breadth-first'`` appends them,
            ``'depth-first'`` prepends them, ``'random-walk'`` inserts them at
            a random position. Any other value selects ``'auto'``: links on
            the same host as the current page go to a random position, links
            to other hosts go to the front.

    Returns:
        None.
    """
    settings = persona.get('browse', {})
    seedUrls = settings.get('seeds', [])
    whiteList = settings.get('whites', [])
    blackList = settings.get('blacks', [])
    if maxVisits is None:
        maxVisits = math.inf
    # Accept the correct spelling as well as the original misspelled one.
    breadthFirst = ('breath-first', 'breadth-first')
    if direction not in breadthFirst + ('depth-first', 'random-walk'):
        direction = 'auto'
    print("browse: {} {}x max {} {}x seeds {}x mandatory {}x forbidden".format(
        PERSONA, maxVisits, direction, len(seedUrls), len(whiteList), len(blackList)))

    urlsToVisit = deque(seedUrls)
    urlsVisited = set()
    # Every URL ever queued (visited or still waiting). Gives O(1) duplicate
    # checks instead of scanning the deque for each link.
    urlsSeen = set(seedUrls)
    blackTerms = [black.lower() for black in blackList]
    parser = HtmlParser()
    while urlsToVisit and len(urlsVisited) < maxVisits:
        url = urlsToVisit.popleft()
        site = urlparse(url).netloc
        text, links = parser.get_text_links(url)
        urlsVisited.add(url)
        # Lower-case the page once; skip it entirely when nothing is forbidden.
        lowerText = text.lower() if blackTerms else ''
        if not any(black in lowerText for black in blackTerms):
            for link in links:
                parts = urlsplit(link)
                # Drop query string and fragment so page variants collapse
                # into a single URL.
                link = urlunsplit((parts.scheme, parts.netloc, parts.path, '', ''))
                if link in urlsSeen:
                    continue
                urlsSeen.add(link)
                if direction in breadthFirst:
                    urlsToVisit.append(link)
                elif direction == 'depth-first':
                    urlsToVisit.appendleft(link)
                elif direction == 'auto' and parts.netloc != site:
                    # auto: a different host is explored first
                    urlsToVisit.appendleft(link)
                else:
                    # random-walk, or auto with a link on the same host
                    urlsToVisit.insert(randint(0, len(urlsToVisit)), link)
        if len(urlsVisited) % 100 == 0:
            print("{} / {} {} {}x".format(len(urlsVisited), len(urlsToVisit), printable(url), len(links)))
def browse(persona, maxVisits=None, direction=None):
    """Crawl outward from the persona's seed URLs, one page per step.

    Each URL taken from the queue is handed to ``HtmlParser.get_text_links``.
    Links found on the page are stripped of query string and fragment and,
    if not seen before, queued according to ``direction``. A page whose text
    contains any of the persona's ``'blacks'`` terms (case-insensitive)
    contributes no links. A progress line is printed after every visit.

    Args:
        persona: Mapping whose optional ``'browse'`` entry supplies the
            ``'seeds'``, ``'whites'`` and ``'blacks'`` lists. Only the length
            of ``'whites'`` is used here (in the start-up message).
        maxVisits: Stop once this many distinct URLs have been visited;
            ``None`` means no limit.
        direction: Queueing strategy for new links. ``'breath-first'`` (the
            historical spelling) or ``'breadth-first'`` appends them,
            ``'depth-first'`` prepends them, ``'random-walk'`` inserts them at
            a random position. Any other value selects ``'auto'``: links on
            the same host as the current page go to a random position, links
            to other hosts go to the front.

    Returns:
        None.
    """
    settings = persona.get('browse', {})
    seedUrls = settings.get('seeds', [])
    whiteList = settings.get('whites', [])
    blackList = settings.get('blacks', [])
    if maxVisits is None:
        maxVisits = math.inf
    # Accept the correct spelling as well as the original misspelled one.
    breadthFirst = ('breath-first', 'breadth-first')
    if direction not in breadthFirst + ('depth-first', 'random-walk'):
        direction = 'auto'
    print("browse: {} {}x max {} {}x seeds {}x mandatory {}x forbidden".format(
        PERSONA, maxVisits, direction, len(seedUrls), len(whiteList), len(blackList)))

    urlsToVisit = deque(seedUrls)
    urlsVisited = set()
    # Every URL ever queued (visited or still waiting). Gives O(1) duplicate
    # checks instead of scanning the deque for each link.
    urlsSeen = set(seedUrls)
    blackTerms = [black.lower() for black in blackList]
    parser = HtmlParser()
    while urlsToVisit and len(urlsVisited) < maxVisits:
        url = urlsToVisit.popleft()
        site = urlparse(url).netloc
        text, links = parser.get_text_links(url)
        urlsVisited.add(url)
        # Lower-case the page once; skip it entirely when nothing is forbidden.
        lowerText = text.lower() if blackTerms else ''
        if not any(black in lowerText for black in blackTerms):
            for link in links:
                parts = urlsplit(link)
                # Drop query string and fragment so page variants collapse
                # into a single URL.
                link = urlunsplit((parts.scheme, parts.netloc, parts.path, '', ''))
                if link in urlsSeen:
                    continue
                urlsSeen.add(link)
                if direction in breadthFirst:
                    urlsToVisit.append(link)
                elif direction == 'depth-first':
                    urlsToVisit.appendleft(link)
                elif direction == 'auto' and parts.netloc != site:
                    # auto: a different host is explored first
                    urlsToVisit.appendleft(link)
                else:
                    # random-walk, or auto with a link on the same host
                    urlsToVisit.insert(randint(0, len(urlsToVisit)), link)
        print("{} / {} {} {}x".format(
            len(urlsVisited), len(urlsToVisit), printable(url), len(links)))