# Imports assumed by this listing (not shown in the original excerpt); the Pages class
# and the SparkContext "sc" used further below come from project code not included here.
import re
from operator import add, itemgetter
import networkx as nx
import matplotlib.pyplot as plt

def readPagesByTerm(term):
	pages = Pages()
	# getting the top similar pages for the searched term
	main_term_id = str(pages.get_id(term))
	sim_pages = pages.query_page(main_term_id)
	top_similar_pages = sorted(sim_pages, key=lambda x: x[1], reverse=True)[1:21]
	possible_page_ids = set()
	possible_page_ids.add(main_term_id)
	# the main term's similarity to itself is stored as 0.0
	value_of_list_item = main_term_id + ',' + main_term_id + ',0.0'
	# set of "page_id,page_id,similarity" strings that forms the page-page matrix
	page_page_set = set()
	page_page_set.add(value_of_list_item)
	# Save the page-page similarity values in dict
	pages_pages_association_dict = dict()
	pages_pages_association_dict[main_term_id] = dict()
	for page_vals in sim_pages:
		pages_pages_association_dict[main_term_id][str(page_vals[0])] = page_vals[1]
		# store the symmetric entry: the page's similarity back to the main term
		pages_pages_association_dict[str(page_vals[0])] = {main_term_id: page_vals[1]}
	print("main term", main_term_id)
	# for each page similar to the search term (first-layer pages), get its own top similar pages (second layer)
	for page in top_similar_pages:
		possible_page_ids.add(str(page[0]))
		if str(page[0]) not in pages_pages_association_dict:
			pages_pages_association_dict[str(page[0])] = dict()
		second_layer_pages = pages.query_page(page[0])
		second_layer_pages_top = sorted(second_layer_pages, key=lambda x: x[1], reverse=True)[1:21]
		for associated_page in second_layer_pages_top:
			possible_page_ids.add(str(associated_page[0]))
		for associated_page in second_layer_pages:
			if str(associated_page[0]) not in pages_pages_association_dict:
				pages_pages_association_dict[str(associated_page[0])] = dict()
			pages_pages_association_dict[str(page[0])][str(associated_page[0])] = associated_page[1]
			pages_pages_association_dict[str(associated_page[0])][str(page[0])] = associated_page[1]
	# form all the page-page similarity values	
	for page1 in possible_page_ids:
		for page2 in possible_page_ids:
			if page1 == page2:
				# a page's similarity to itself is recorded as 0.0
				value = page1 + ',' + page2 + ',0.0'
				page_page_set.add(value)
			else:
				# if page1 and page2 don't have a similarity value, the similarity is set to 0.0
				if page1 in pages_pages_association_dict and page2 in pages_pages_association_dict[page1]:
					value = page1 + ',' + page2 + ',' + str(pages_pages_association_dict[page1][page2])
				else:
					value = page1 + ',' + page2 + ',0.0'
				page_page_set.add(value)
				if page2 in pages_pages_association_dict and page1 in pages_pages_association_dict[page2]:
					value = page2 + ',' + page1 + ',' + str(pages_pages_association_dict[page2][page1])
				else:
					value = page2 + ',' + page1 + ',0.0'
				page_page_set.add(value)
	# print out the pages in the similarity matrix
	print(possible_page_ids)
	print(len(pages_pages_association_dict))
	print(pages_pages_association_dict.keys())
	# return the page-page similarity matrix
	return list(page_page_set)
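
# Usage sketch (not part of the original listing): parse the "id1,id2,score" strings
# returned by readPagesByTerm back into a nested dict; the helper name and the search
# term used in the commented call are illustrative placeholders.
def buildSimilarityMatrix(term):
	matrix = dict()
	for row in readPagesByTerm(term):
		page1, page2, score = row.split(',')
		matrix.setdefault(page1, dict())[page2] = float(score)
	return matrix

# similarity = buildSimilarityMatrix("graph theory")
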
def graph(term):
	# build a similarity network graph of depth 2 for the term
	pages = Pages()
	page_id = pages.get_id(term)
	sim_pages = pages.query_page(page_id)

	# make a networkx graph object
	G = nx.Graph()
	node_dict = {}
	title = pages.get_title(int(page_id))
	G.add_node(title)
	# the original term is colored red
	node_dict[title] = ('red', 0.0)

	#add top 10 pages similar to term
	short_sim_pages = sorted(sim_pages, key=lambda x: x[1], reverse=True)[1:11]
	for page in short_sim_pages:
		page_name = pages.get_title(int(page[0]))
		G.add_node(page_name)
		#depth = 1 terms are blue
		node_dict[page_name] = ('blue', float(page[1]))
		G.add_edge(title,page_name)
		addl_nodes = add_x_nodes(pages, G, page[0], 10)
		#add next depth of pages
		for node in addl_nodes:
			if node not in node_dict:
				#depth = 2 terms are green
				node_dict[node] = ('green', float(page[1]))

	node_list = []
	node_color = []
	node_size = []
	# make node size proportional to each node's similarity score to the original term
	for node in node_dict:
		node_list.append(node)
		node_color.append(node_dict[node][0])
		node_size.append(node_dict[node][1])
	node_size = [x * 300 / max(node_size) for x in node_size]
	# the original term was stored with score 0.0; give it a fixed size instead
	node_size[node_size.index(0.0)] = 300

	#draw and save network figure
	nx.draw_networkx(G=G, with_labels=True, nodelist=node_list, node_size=node_size, node_color=node_color, font_size=8)
	fig_name = "%s.png" % term
	plt.savefig(fig_name)
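
# add_x_nodes is called above but is not included in this excerpt. A minimal sketch of
# what such a helper might look like, assuming it adds the top-x pages most similar to
# page_id as nodes and edges of G and returns their titles; the real implementation may
# differ.
def add_x_nodes(pages, G, page_id, x):
	parent_title = pages.get_title(int(page_id))
	# top-x most similar pages, skipping the page itself
	sim_pages = sorted(pages.query_page(page_id), key=lambda p: p[1], reverse=True)[1:x + 1]
	titles = []
	for sim_page in sim_pages:
		child_title = pages.get_title(int(sim_page[0]))
		G.add_node(child_title)
		G.add_edge(parent_title, child_title)
		titles.append(child_title)
	return titles
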
def getDifferenceWikiPageRankLists(search_term, rank_list):
	pages = Pages()
	page_id = pages.get_id(search_term)
	wiki_page_links = set(pages.getPageLinksList(int(page_id)))
	rank_name_list = [pages.get_title(int(ranked_page[0])).lower() for ranked_page in rank_list]
	rank_set = set(rank_name_list)
	# get the links that appear in both the ranked list and the Wikipedia links
	print("Intersection", rank_set.intersection(wiki_page_links))
	# get the links that exist only in the ranked list
	print("Difference", rank_set.difference(wiki_page_links))
	page_summary, page_content = pages.getPageSummaryContent(page_id)
	# get ranked pages that exist in summary and content of page
	links_in_summary_only = set()
	links_in_content_only = set()
	links_in_both = set()
	for rank_page in rank_name_list:
		in_summary = False
		in_content = False
		# escape the title and ignore case so titles with regex metacharacters or different casing still match
		if re.search(r'\b%s\b' % re.escape(rank_page), page_summary, re.IGNORECASE):
			in_summary = True
		if re.search(r'\b%s\b' % re.escape(rank_page), page_content, re.IGNORECASE):
			in_content = True
		if in_summary and in_content:
			links_in_both.add(rank_page)
		elif in_summary:
			links_in_summary_only.add(rank_page)
		elif in_content:
			links_in_content_only.add(rank_page)
	# print a summary of the comparison between the wiki links, the page content, and the ranked list
	print("links_in_both", links_in_both)
	print("links_in_summary_only", links_in_summary_only)
	print("links_in_content_only", links_in_content_only)
	PAGES_IN_WIKI_LINKS_AND_CONTENT = links_in_both.union(rank_set.intersection(wiki_page_links))
	print ("pages in both links and the content", PAGES_IN_WIKI_LINKS_AND_CONTENT)
	top_ranked_50 = set(rank_name_list[0:50])
	print ("print intersection with top ranks 50 and wiki", top_ranked_50.intersection(PAGES_IN_WIKI_LINKS_AND_CONTENT))
	print ("pages in top 50 and not in common set", top_ranked_50.difference(PAGES_IN_WIKI_LINKS_AND_CONTENT))
def getTopSimilarPagesByID(page_id):
	pages = Pages()
	sim_pages = pages.query_page(page_id)
	top_similar_pages = sorted(sim_pages, key=lambda x: x[1], reverse=True)[1:11]
	return top_similar_pages
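
# Usage sketch (illustrative only): look up a page id with the Pages helper, then fetch
# its top ten most similar pages; the search term below is just a placeholder.
# pages = Pages()
# top_pages = getTopSimilarPagesByID(pages.get_id("machine learning"))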
            lambda url_urls_rank: computeContribsAssociation(url_urls_rank[1][0], url_urls_rank[1][1]))

        # Re-calculates page similarity ranks based on neighbor contributions.
        ranks = contribs.reduceByKey(add).mapValues(lambda rank: rank * 0.85 + 0.15/pages_num)
        # loop until the ranks converge or the maximum number of iterations is reached
        if iteration > 0:
            print("iteration", iteration)
            converged_count = 0
            converge_norm = 0
            for (page_id, rank_value) in ranks.collect():
                if abs(rank_value - prev_rank.lookup(page_id)[0]) < 0.001:
                    converged_count += 1
                converge_norm += abs(rank_value - prev_rank.lookup(page_id)[0])
            print("converged count", converged_count)
            # if the total change in ranks is small enough, the iteration has converged
            if converge_norm <= 0.001:
                print("Converged after", iteration, "iterations")
                break
        # store the current ranks to compare them against the next iteration's ranks
        prev_rank = ranks
    # sort the pages by their similarity rank
    sorted_ranks = sorted(ranks.collect(), key=itemgetter(1), reverse=True)
    # output the similarity ranks, normalized by the highest rank
    load_page = Pages()
    max_rank = sorted_ranks[0][1]
    for (link, rank) in sorted_ranks:
        print("%s,%s" % (load_page.get_title(int(link)), rank / max_rank))
    # get the difference between the Wikipedia links and the similarity ranking list, and check whether the ranked links appear in the page content
    getDifferenceWikiPageRankLists(search_term, sorted_ranks)
    sc.stop()
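
# computeContribsAssociation is called above but is not part of this excerpt. A minimal
# sketch of what such a function might look like, assuming the neighbor list holds plain
# page ids and the classic PageRank scheme of spreading a page's rank evenly over its
# neighbors; the real implementation may instead weight contributions by similarity.
def computeContribsAssociation(neighbor_pages, rank):
    num_neighbors = len(neighbor_pages)
    for neighbor in neighbor_pages:
        # each neighbor receives an equal share of this page's current rank
        yield (neighbor, rank / num_neighbors)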