Example #1
0
def search(persona):
    """Fetch every search URL derived from *persona* and print progress.

    Each entry of ``persona['search']['queries']`` is used as a
    ``str.format`` template and filled with each entry of
    ``persona['search']['terms']``. Every resulting URL is passed to
    ``HtmlParser.get_text_links`` and the links it yields are gathered
    into a set of distinct links. Nothing is returned; the collected
    links are only counted in the progress output.
    """
    config = persona.get('search', {})
    queries = config.get('queries', [])
    terms = config.get('terms', [])
    print("search: {} {}x queries {}x terms".format(
        PERSONA, len(queries), len(terms)))

    total = len(queries) * len(terms)
    found = set()
    parser = HtmlParser()
    # All URLs are formatted before the first fetch, so a malformed
    # template fails before any page is requested.
    urls = [query.format(term) for query in queries for term in terms]
    for done, url in enumerate(urls, start=1):
        _, links = parser.get_text_links(url)
        found.update(links)
        print("{} / {} {} {}x {}x".format(
            done, total, printable(url), len(links), len(found)))
Example #2
0
def search(persona):
    """Run all of the persona's searches and report progress on stdout.

    The ``'search'`` entry of *persona* may hold ``'queries'`` (URL
    templates for ``str.format``) and ``'terms'`` (values substituted
    into them); both default to empty lists. One URL is built per
    (query, term) pair and fetched with ``HtmlParser.get_text_links``.
    The distinct links seen so far are tracked only to print their
    count; the function returns nothing.
    """
    settings = persona.get('search', {})
    queries = settings.get('queries', [])
    terms = settings.get('terms', [])
    header = "search: {} {}x queries {}x terms"
    print(header.format(PERSONA, len(queries), len(terms)))

    expected = len(queries) * len(terms)
    distinctLinks = set()
    parser = HtmlParser()
    # Build the full URL list first, then fetch them in order.
    pending = [query.format(term) for query in queries for term in terms]
    progress = "{} / {} {} {}x {}x"
    for count, url in enumerate(pending, start=1):
        _, links = parser.get_text_links(url)
        distinctLinks.update(links)
        print(progress.format(count, expected, printable(url), len(links),
                              len(distinctLinks)))
Example #3
0
def browse(persona, maxVisits=None, direction=None):
    """Crawl outward from the persona's seed URLs, printing progress.

    Args:
        persona: Mapping whose optional ``'browse'`` entry may hold the
            lists ``'seeds'`` (start URLs), ``'whites'`` and ``'blacks'``.
        maxVisits: Stop once this many distinct URLs have been visited.
            ``None`` means no limit.
        direction: Where newly discovered links are queued:
            ``'breath-first'`` appends them at the back,
            ``'depth-first'`` pushes them to the front and
            ``'random-walk'`` inserts them at a random position. Any
            other value selects ``'auto'``: links on the same host as
            the current page go to a random position, links to another
            host go to the front.

    Links are reduced to scheme, host and path (query string and
    fragment are dropped) before being queued. When any black-list term
    occurs, case-insensitively, in a page's text, none of that page's
    links are followed. Nothing is returned.
    """
    config = persona.get('browse', {})
    seedUrls = config.get('seeds', [])
    # NOTE(review): whiteList is only counted in the banner below; no
    # filtering on it happens in this function -- confirm whether the
    # "mandatory" terms are meant to be enforced here.
    whiteList = config.get('whites', [])
    blackList = config.get('blacks', [])
    if maxVisits is None:
        maxVisits = math.inf
    if direction not in ['breath-first', 'depth-first', 'random-walk']:
        direction = 'auto'
    print("browse: {} {}x max {} {}x seeds {}x mandatory {}x forbidden".format(
        PERSONA, maxVisits, direction, len(seedUrls), len(whiteList),
        len(blackList)))

    # urlsQueued holds every URL that was ever put on the queue (and so is
    # either still waiting or already visited). Testing against it replaces
    # a linear scan of the deque for each discovered link.
    urlsQueued = set()
    urlsToVisit = deque()
    for seed in seedUrls:
        if seed not in urlsQueued:  # do not fetch a repeated seed twice
            urlsQueued.add(seed)
            urlsToVisit.append(seed)
    urlsVisited = set()
    blackTerms = [black.lower() for black in blackList]
    parser = HtmlParser()
    while urlsToVisit and len(urlsVisited) < maxVisits:
        url = urlsToVisit.popleft()
        site = urlparse(url).netloc
        text, links = parser.get_text_links(url)
        urlsVisited.add(url)
        # Lower-case the page once per page, and only when there is
        # something to look for.
        loweredText = text.lower() if blackTerms else ''
        if not any(black in loweredText for black in blackTerms):
            for link in links:
                parts = urlsplit(link)
                link = urlunsplit(
                    (parts.scheme, parts.netloc, parts.path, '', ''))
                if link in urlsQueued:
                    continue
                urlsQueued.add(link)
                if direction == 'auto':
                    if urlparse(link).netloc == site:
                        # same host -> random position
                        urlsToVisit.insert(randint(0, len(urlsToVisit)),
                                           link)
                    else:
                        # different host -> visit next
                        urlsToVisit.appendleft(link)
                elif direction == 'breath-first':
                    urlsToVisit.append(link)
                elif direction == 'depth-first':
                    urlsToVisit.appendleft(link)
                else:
                    # random walk
                    urlsToVisit.insert(randint(0, len(urlsToVisit)), link)
        # Report only on every 100th visited URL.
        if len(urlsVisited) % 100 == 0:
            print("{} / {} {} {}x".format(len(urlsVisited), len(urlsToVisit),
                                          printable(url), len(links)))
Example #4
0
def browse(persona, maxVisits=None, direction=None):
    """Crawl outward from the persona's seed URLs, printing each visit.

    Args:
        persona: Mapping whose optional ``'browse'`` entry may hold the
            lists ``'seeds'`` (start URLs), ``'whites'`` and ``'blacks'``.
        maxVisits: Stop once this many distinct URLs have been visited.
            ``None`` means no limit.
        direction: Where newly discovered links are queued:
            ``'breath-first'`` appends them at the back,
            ``'depth-first'`` pushes them to the front and
            ``'random-walk'`` inserts them at a random position. Any
            other value selects ``'auto'``: links on the same host as
            the current page go to a random position, links to another
            host go to the front.

    Links are reduced to scheme, host and path (query string and
    fragment are dropped) before being queued. When any black-list term
    occurs, case-insensitively, in a page's text, none of that page's
    links are followed. Nothing is returned.
    """
    config = persona.get('browse', {})
    seedUrls = config.get('seeds', [])
    # NOTE(review): whiteList is only counted in the banner below; no
    # filtering on it happens in this function -- confirm whether the
    # "mandatory" terms are meant to be enforced here.
    whiteList = config.get('whites', [])
    blackList = config.get('blacks', [])
    if maxVisits is None:
        maxVisits = math.inf
    if direction not in ['breath-first', 'depth-first', 'random-walk']:
        direction = 'auto'
    print("browse: {} {}x max {} {}x seeds {}x mandatory {}x forbidden".format(
          PERSONA, maxVisits, direction, len(seedUrls), len(whiteList), len(blackList)))

    # urlsQueued holds every URL that was ever put on the queue (and so is
    # either still waiting or already visited). Testing against it replaces
    # a linear scan of the deque for each discovered link.
    urlsQueued = set()
    urlsToVisit = deque()
    for seed in seedUrls:
        if seed not in urlsQueued:  # do not fetch a repeated seed twice
            urlsQueued.add(seed)
            urlsToVisit.append(seed)
    urlsVisited = set()
    blackTerms = [black.lower() for black in blackList]
    parser = HtmlParser()
    while urlsToVisit and len(urlsVisited) < maxVisits:
        url = urlsToVisit.popleft()
        site = urlparse(url).netloc
        text, links = parser.get_text_links(url)
        urlsVisited.add(url)
        # Lower-case the page once per page, and only when there is
        # something to look for.
        loweredText = text.lower() if blackTerms else ''
        if not any(black in loweredText for black in blackTerms):
            for link in links:
                parts = urlsplit(link)
                link = urlunsplit(
                    (parts.scheme, parts.netloc, parts.path, '', ''))
                if link in urlsQueued:
                    continue
                urlsQueued.add(link)
                if direction == 'auto':
                    if urlparse(link).netloc == site:
                        # same host -> random position
                        urlsToVisit.insert(randint(0, len(urlsToVisit)), link)
                    else:
                        # different host -> visit next
                        urlsToVisit.appendleft(link)
                elif direction == 'breath-first':
                    urlsToVisit.append(link)
                elif direction == 'depth-first':
                    urlsToVisit.appendleft(link)
                else:
                    # random walk
                    urlsToVisit.insert(randint(0, len(urlsToVisit)), link)
        print("{} / {} {} {}x".format(
              len(urlsVisited), len(urlsToVisit), printable(url), len(links)))