#!/usr/bin/env python
# Accepts the graph generated by the web crawler and generates an index
# that represents the content on the set of the pages and the URLs at which each content element
# was found.
import page_rank
import urllib2
import ssl
import re
# required to gain access to https urls on the school server from home
ssl._create_default_https_context = ssl._create_unverified_context
index = {}
def get_page_text(url, index, word_count=0):
    """Fetch *url*, extract its visible body text, and add every kept word
    to *index* (keyword -> {url: [word positions]}) via add_to_index.

    Words are lowercased, stripped of punctuation, and skipped when they
    appear in ignoreList.txt or do not start with an alphanumeric character.
    word_count is the starting word-position counter (defaults to 0).
    """
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # always release the connection, even if read() raises
        response.close()
    page_text, page_words = "", []
    # keep only the contents of the <body> element
    html = html[html.find("<body") + 5:html.find("</body>")]
    # strip out every <script>...</script> block
    open_script_tag = html.find("<script")
    while open_script_tag > -1:
        end_script_tag = html.find("</script>")
        if end_script_tag == -1:
            # unclosed script tag: drop the remainder rather than
            # mis-slicing with find()'s -1 sentinel
            html = html[:open_script_tag]
            break
        html = html[:open_script_tag] + html[end_script_tag + 9:]
        open_script_tag = html.find("<script")
    # load the stop-word list, one word per line
    ignore_words = []
    with open("ignoreList.txt", "r") as fin:
        for word in fin:
            ignore_words.append(word.strip())
    # walk the remaining markup, collecting the text between tags
    finished = False
    while not finished:
        next_close_tag = html.find(">")
        next_open_tag = html.find("<")
        if next_open_tag > -1:
            content = " ".join(html[next_close_tag + 1:next_open_tag].strip().split())
            page_text = page_text + " " + content
            html = html[next_open_tag + 1:]
        else:
            finished = True
    for word in page_text.split():
        word = word.lower()
        word = re.sub(r'[^\w\s]', '', word)  # remove punctuation
        # guard against words reduced to "" by the punctuation strip
        # (word[0] on an empty string raises IndexError)
        if word and word[0].isalnum() and word not in ignore_words:
            page_words.append(word)
    for word in page_words:
        word_count += 1
        add_to_index(index, word, url, word_count)
def add_to_index(index, keyword, url, word_count):
    """Record that *keyword* occurs at position *word_count* on *url*.

    *index* maps keyword -> {url: [word positions]}; missing levels are
    created on demand and positions accumulate in insertion order.
    """
    positions_by_url = index.setdefault(keyword, {})
    positions_by_url.setdefault(url, []).append(word_count)
def scrape_page(crawled_graph):
    """Index the text of every distinct URL in *crawled_graph*, then hand
    the graph to page_rank.compute_ranks.

    Results accumulate in the module-level *index* dict.
    """
    visited = set()
    for url in crawled_graph:
        if url in visited:
            continue
        get_page_text(url, index)
        visited.add(url)
    # calculate the page ranks using the crawled graph
    page_rank.compute_ranks(crawled_graph)
def get_index():
    # Accessor for the module-level index built by scrape_page:
    # maps keyword -> {url: [word positions]}.
    return index