Example #1
def populate_inlinks_dict(lines):
    try:
        for line in lines:
            words = line.split()
            M[words[0]] = words[1:]  # first token is the page id, the rest are its inlinks
    except Exception as e:
        logerror(e)
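These snippets share several module-level containers that the listings never define. From how they are used across the examples, a minimal setup sketch could look like the following; the names come straight from the code, but the initialization itself is an assumption:

# assumed module-level state (not shown in the original listings)
M = {}       # page -> list of pages that link to it (inlinks)
L = {}       # page -> number of outlinks that page has
S = []       # sink pages, i.e. pages with no outlinks
PR = {}      # page -> current PageRank value
NEWPR = {}   # page -> PageRank value for the next iteration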
Example #2
def start():
    try:
        inlinks_file_path = raw_input(
            "> Enter the path to the inlink graph file with proper extension. \n>\t"
        )
        inlinks_file = open(inlinks_file_path, "r")
        lines = inlinks_file.readlines()
        populate_inlinks_dict(lines)
        populate_outlinks_dict()
        populate_sink_pages()
        populate_page_rank()
        calc_page_rank()
        sort_pages_rank()
        sort_pages_inlink()
        print "================================================================="
        print "========================STATISTICS=============================== \n"
        sinks = number_of_sinks()
        sources = number_of_sources()
        total_pages = len(M.keys())
        print("Total number of pages = " + str(total_pages))
        print("Proportion of Sinks = " + str(sinks / float(total_pages)))
        print("Proportion of Sources = " + str(sources / float(total_pages)))
        print "================================================================="
        print "================================================================="

    except Exception as e:
        logerror(e)
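The inlink graph file read here is parsed line by line by populate_inlinks_dict from Example #1: each line names a page followed by the pages that link to it. A made-up two-line file for illustration:

# hypothetical file contents:
#   A
#   B A
# after populate_inlinks_dict(lines), M == {"A": [], "B": ["A"]}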
Example #3
def fetch_all_links(current_crawl, keyword):
    base_url = "https://en.wikipedia.org"
    pattern = re.compile('^/wiki/')
    new_links_list = []
    try:
        html_content = get_html_content(current_crawl)
        #collects every link inside the bodyContent div whose href starts with /wiki/;
        #administrative links containing ":" and url fragments after "#" are filtered out below
        links = html_content.find("div", {"id": "bodyContent"}).find_all('a', href=pattern)
        for link in links:
            if ":" not in link.get('href'):
                url = urlparse.urljoin(base_url, link.get('href'))
                if "#" in link.get('href'):
                    url = url[:url.index('#')]
                anchor = link.text.encode("utf-8")
                match = re.search(r'.*{0}.*'.format(keyword), url, re.I)
                keywordsearch = re.search(r'.*{0}.*'.format(keyword), anchor,
                                          re.I)
                if (url not in new_links_list) and (match or keywordsearch):
                    new_links_list.append(url)
                    LINK_ANCHOR.update({url: anchor})
    except Exception as e:
        logerror(e)
    return new_links_list
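As a quick illustration of the url handling above (the href value here is made up), joining a relative /wiki/ link against the base url and then cutting at the first '#' drops the fragment:

import urlparse

href = "/wiki/Solar_power#History"                        # hypothetical href
url = urlparse.urljoin("https://en.wikipedia.org", href)  # https://en.wikipedia.org/wiki/Solar_power#History
if "#" in href:
    url = url[:url.index('#')]                            # https://en.wikipedia.org/wiki/Solar_power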
Example #4
def cal_perplexity():
    try:
        entropy = 0.0  # initializing entropy to 0.0
        for p in PR.keys():
            entropy -= PR[p] * log(PR[p], 2)
        return 2**entropy
    except Exception as e:
        logerror(e)
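cal_perplexity computes the Shannon entropy H = -sum(PR(p) * log2 PR(p)) of the rank distribution and returns 2**H. As a sanity check with a made-up four-page distribution, uniform ranks give a perplexity equal to the number of pages:

from math import log

# hypothetical uniform distribution over four pages
PR = {"A": 0.25, "B": 0.25, "C": 0.25, "D": 0.25}
entropy = -sum(PR[p] * log(PR[p], 2) for p in PR)  # 2.0 bits
print 2 ** entropy                                 # 4.0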
Example #5
def populate_sink_pages():
    try:
        for p in M.keys():  # iterating through all the unique crawled pages
            if not L.has_key(p):  # checking whether page p has any outlink
                S.append(p)
    except Exception as e:
        logerror(e)
Example #6
def populate_page_rank():
    try:
        total_pages = float(len(M.keys()))  # float, because the value is used to calculate PR
        for p in M.keys():
            PR[p] = 1.0 / total_pages  # initializing each page's rank to 1/total number of pages
    except Exception as e:
        logerror(e)
Example #7
def number_of_sources():
    try:
        c = 0
        for p in M:
            if not M[p]:  # a page with no inlinks is a source
                c += 1
        print("Number of Sources: " + str(c))
        return float(c)
    except Exception as e:
        logerror(e)
Example #8
def populate_outlinks_dict():
    try:
        for p in M.keys():  # iterating through all the unique crawled pages
            for q in M.get(p):  # iterating through the list of inlinks for page p
                if L.has_key(q):  # q is already in the outlink dict, increment its counter
                    L[q] += 1
                else:
                    L[q] = 1  # q is not yet in the outlink dict, start its counter at 1
    except Exception as e:
        logerror(e)
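Since M maps each page to its inlinks, counting how often a page q shows up across all of those lists gives the number of outlinks q has. A made-up three-page example:

# hypothetical graph: A links to B and C, B links to C
M = {"A": [], "B": ["A"], "C": ["A", "B"]}
# after populate_outlinks_dict(), L == {"A": 2, "B": 1}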
Example #9
def sort_pages_rank():
    try:
        spr = sorted(PR.iteritems(), key=operator.itemgetter(1), reverse=True)
        sorted_file = open("SortedPage.txt", "a")
        for sp in range(min(50, len(spr))):  # write at most the top 50 pages
            sorted_file.write(str(spr[sp]) + "\n")
        sorted_file.close()
    except Exception as e:
        logerror(e)
Example #10
def sort_pages_inlink():
    try:
        inlink_rank = {}
        for p in M:
            inlink_rank[p] = len(M.get(p))
        inlink_rank = sorted(inlink_rank.iteritems(),
                             key=operator.itemgetter(1),
                             reverse=True)
        inlink_file = open("InlinkPageRank.txt", "a")
        for ir in range(min(5, len(inlink_rank))):  # top five pages by number of inlinks
            inlink_file.write(str(inlink_rank[ir]) + " \n")
        inlink_file.close()
    except Exception as e:
        logerror(e)
Example #11
def get_html_content(url):
    try:
        html = urllib2.urlopen(url)
        content = BeautifulSoup(html,"html.parser")
        content.prettify()
        if LINK_ANCHOR.has_key(url):
            file_name = re.sub(r'[\W]', '_', LINK_ANCHOR[url])  # replacing non-word characters in the anchor text
        else:
            file_name = re.sub(r'[\W]', '_', url)  # replacing non-word characters in the url
        LINK_FILENAME.update({url: file_name + ".html"})
        out_file = open(CRAWLED_HTML_PATH + "\\" + file_name + ".html", 'w')
        out_file.write(url.encode('UTF-8') + "\n" + content.prettify().encode('UTF-8'))
        out_file.close()
        html.close()
        return content
    except Exception as e:
        logerror(e)
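The output path above is built with a hard-coded Windows separator. A platform-neutral alternative, assuming the same CRAWLED_HTML_PATH and file_name variables, would be os.path.join (a sketch, not what the original uses):

import os

out_path = os.path.join(CRAWLED_HTML_PATH, file_name + ".html")  # same file as CRAWLED_HTML_PATH + "\\" + file_name + ".html" on Windows
out_file = open(out_path, 'w')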
Example #12
def start():
    try:
        remove_files()
        depth = raw_input("> Enter the depth where depth starts from 1 \n>\t")
        crawled_limit = raw_input("> Enter the number limit of crawled urls\n>\t")
        seed_url="https://en.wikipedia.org/wiki/Sustainable_energy"
        crawledlist = web_crawl(seed_url,int(depth),int(crawled_limit))
        if crawledlist:
            out_file = open(CRAWLEDLISTPATH+"\crawled_list.txt",'a')
            count=1
            for url in crawledlist:
                out_file.write(str(count)+".\t" +url + "\n")
                count+=1
            print "The crawled_list.txt can be found in %s" %(CRAWLEDLISTPATH)
            print "The crawled list of htmls can be found in %s" %(CRAWLED_HTML_PATH)
            out_file.close()
        print "The error file can be found at %s" %(ERROR_FILE_PATH)
    except Exception as e:
        logerror(e)
Example #13
def web_crawl(seed, max_depth,crawled_limit):
    frontier_crawl = [seed]  # urls still to be crawled at the current depth
    visited = []
    next_depth_urls = []  # stores all unique urls found for the next depth
    depth = 1  # the seed page is depth 1
    try:
        while frontier_crawl and depth <= max_depth and len(visited) < crawled_limit:
            current_crawl = frontier_crawl.pop(0)  # take the next url from the front of the queue (BFS order)
            if current_crawl not in visited:
                new_url_links = fetch_all_links(current_crawl)
                if new_url_links is not None:
                    merge_results(next_depth_urls, new_url_links)
                    visited.append(current_crawl)
                    time.sleep(1)  # waiting policy: 1 second between requests
            if not frontier_crawl:
                frontier_crawl, next_depth_urls = next_depth_urls, []  # current depth exhausted, move on to the next depth
                depth += 1
    except Exception as e:
        logerror(e)
        return visited
    return visited
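A minimal invocation sketch for this breadth-first variant, reusing the seed url from Example #12 (the depth and limit values are arbitrary):

# example invocation; returns the list of urls that were actually crawled
crawled = web_crawl("https://en.wikipedia.org/wiki/Sustainable_energy", 3, 1000)
print "%d pages crawled" % len(crawled)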
Example #14
def calc_page_rank():
    try:
        # total_pages is the total number of unique crawled pages
        total_pages = float(len(M.keys()))  # float, because the value is used to calculate PR
        # d is the damping factor/teleportation factor
        d = 0.85
        perplexity = 0.0  # initial value of perplexity
        convergence_count = 0  # initializing the convergence_count to 0
        iteration_count = 0  # will track the total number of iterations required to converge
        while convergence_count < 4:
            sinkPR = 0.0
            for p in S:
                sinkPR += PR[p]
            for p in M.keys():
                NEWPR[p] = (1.0 - d) / total_pages  # teleportation factor
                NEWPR[p] += d * sinkPR / total_pages  # spreading the remaining sinkPR evenly
                for q in M[p]:  # traversing through the inlinks of page p
                    NEWPR[p] += d * PR[q] / L[q]  # add the share of PageRank from each inlink
            for page in M.keys():
                PR[page] = NEWPR[page]  # setting the new PageRank
            new_perplexity = cal_perplexity()
            if abs(new_perplexity - perplexity) < 1.0:
                convergence_count += 1
            else:
                convergence_count = 0
            perplexity = new_perplexity
            iteration_count += 1
            outfile = open("perplexity_per_round.txt", "a")
            outfile.write("Perplexity value: " + str(perplexity) +
                          " for the iteration: " + str(iteration_count) + "\n")
            outfile.close()

    except Exception as e:
        logerror(e)
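Written out, the update the inner loops implement is the standard sink-aware PageRank rule: NEWPR(p) = (1 - d)/N + d * sinkPR/N + d * sum over q in M[p] of PR(q)/L(q), where N is the total number of pages, L(q) is the number of outlinks of q, and sinkPR is the rank mass currently held by the sink pages in S. The loop stops once the perplexity from Example #4 changes by less than 1.0 for four consecutive iterations.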
Example #15
def get_html_content(url):
    try:
        counter = 1
        html = urllib2.urlopen(url)
        content = BeautifulSoup(html, "html.parser")
        content.prettify()
        article_id = url.split("/wiki/")[1]
        file_name = re.sub(r'-*_*', '', article_id)  # strips hyphens and underscores from the article id
        if file_name not in LINK_FILENAME:
            LINK_FILENAME.append(file_name)
        else:
            while file_name in LINK_FILENAME:  # append a counter until the file name is unique
                file_name = file_name + str(counter)
                counter = counter + 1
            LINK_FILENAME.append(file_name)
        out_file = open(CRAWLED_HTML_PATH + "\\" + file_name + ".txt", 'w')
        out_file.write(url.encode('UTF-8') + "\n" + content.prettify().encode('UTF-8'))
        out_file.close()
        html.close()
        return content
    except Exception as e:
        logerror(e)
        return None
Example #16
def web_crawl(url, max_depth, crawled_limit, keyword, visited):
    current_depth = 1  # depth is tracked by passing max_depth - 1 into each recursive call
    try:
        if current_depth <= max_depth and len(visited) < crawled_limit:
            frontier_crawl = fetch_all_links(url, keyword)
            if url not in visited:
                visited.append(url)
            for new_url in frontier_crawl:
                if len(visited) < crawled_limit and max_depth > current_depth:
                    if new_url not in visited:
                        merge_results(
                            visited,
                            web_crawl(new_url, max_depth - 1, crawled_limit,
                                      keyword, visited))
                else:
                    break
            time.sleep(1)  #waiting policy for 1 second.
        else:
            return []
    except Exception as e:
        logerror(e)
        return visited
    return visited
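A minimal invocation sketch for this keyword-focused, recursive variant (keyword and limits are arbitrary; visited starts as an empty list that the function fills and returns):

# example invocation with arbitrary values
visited = web_crawl("https://en.wikipedia.org/wiki/Sustainable_energy", 3, 1000, "solar", [])
print "%d pages crawled" % len(visited)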
Example #17
def number_of_sinks():
    try:
        print("Number of Sinks: " + str(len(S)))
        return float(len(S))
    except Exception as e:
        logerror(e)