# Example #1
# 0
def test(corpus_path="/Users/andrewkittredge/Source/ai/similar_page/corpora/friendly_rentals.txt"):
    """Build a unigram Corpa model from a corpus file and report its variance.

    Args:
        corpus_path: path of the text corpus to load. Defaults to the
            friendly_rentals corpus the original hard-coded, so existing
            callers are unaffected.
    """
    # Renamed from 'input' to avoid shadowing the builtin.
    raw_text = slurp(corpus_path)
    cleansed_text = cleanse(raw_text)
    # 1 => unigrams.
    n_grams = word_n_grams(cleansed_text, 1)
    corpa = Corpa()
    corpa.add_n_grams(n_grams)
    corpa.variance_from_model()
def process_url(url):
    page = get_page_contents(url)
    page_soup = BeautifulSoup(page)
    page_text = visible_text(page_soup)
    page_text = cleanse(page_text)
    try:
        urls = [anchor['href'].strip() for anchor in page_soup.findAll('a') if anchor.has_key('href')]
    except Exception as e:
        print e
    return page_text, urls