def whooshOpen(query):
    ix = open_dir("../lab3/indexdir")
    results_dict = {}
    query = QueryParser('content', ix.schema).parse(query)

    # Collect TF-IDF scores for the top 100 hits, keyed by document number.
    with ix.searcher(weighting=scoring.TF_IDF()) as s_tf:
        tf_results = s_tf.search(query, limit=100)
        for r in tf_results:
            results_dict.setdefault(r.docnum, []).append(r.score)

    # Collect BM25F scores for the same query.
    with ix.searcher(weighting=scoring.BM25F()) as s_bm:
        bm_results = s_bm.search(query, limit=100)
        for r in bm_results:
            results_dict.setdefault(r.docnum, []).append(r.score)

    ranks = pageRank.pageRank(pageRank.inverted_index, 0.85, 10)

    # Keep only documents that appear in both result lists.
    l = []
    for (id, vals) in results_dict.iteritems():
        if len(vals) == 2:
            l.append((vals[0], vals[1], ranks[id]))

    # Combine BM25, TF-IDF and PageRank linearly with the weights returned by start().
    expected = start()
    ys = []
    for (tf, bm, pr) in l:
        ys.append(bm * expected[0] + tf * expected[1] + pr * expected[2] + expected[3])
    print ys
def plot(filename, minBeta, maxBeta, step):
    graph = pR.readGraph(filename)
    N = graph.getSize()
    betas = []
    iterations = []
    ranks = [[] for i in range(N)]
    while minBeta <= maxBeta:
        betas.append(minBeta)
        rks, its = pR.pageRank(graph, minBeta)
        iterations.append(its)
        for n in range(N):
            ranks[n].append(rks[n][-1])
        minBeta += step

    fig, axis = plt.subplots(1, 2)
    #plt.figure()
    ax = axis[0]
    ax.errorbar(betas, iterations, fmt='o--', color='b', capthick=2)
    ax.set_title("PageRank #iterations \nfor graph " + filename.split(".")[0])
    ax.set_xlabel("Beta values")
    ax.set_ylabel("Iterations")

    ax = axis[1]
    for n in range(N):
        ax.plot(betas, ranks[n], label="Node " + str(n + 1))
    ax.set_title("PageRank values \nfor graph " + filename.split(".")[0])
    ax.set_xlabel("Beta values")
    ax.set_ylabel("PageRank values")
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

    fig.tight_layout()
    plt.savefig(filename.split(".")[0] + "_plot.png", bbox_inches='tight', dpi=100)
    plt.show()
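# Hypothetical usage of plot() above (not from the original source): the file
# name and beta sweep below are made-up example values, assuming "graph1.txt"
# is an edge-list file readable by pR.readGraph.
#
#     plot("graph1.txt", minBeta=0.5, maxBeta=0.95, step=0.05)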
def calculatePageRank(expansionMeshWordsCollect, progressControl, MeshTreeField):
    import pageRank
    meshTreeCodeDict = loadMeshTreeCode(MeshTreeField)
    #queryId = '20'
    meshTermAndCodeDictAll = {}
    meshTermPageRankValueAll = {}
    for queryId in expansionMeshWordsCollect:
        meshTermAndCodeDictAll[queryId] = {}
        meshTermPageRankValueAll[queryId] = {}
        #meshTermAndCodeDict = {}
        meshTermId = 0
        for word in expansionMeshWordsCollect[queryId]:
            meshTermAndCodeDictAll[queryId][meshTermId] = [word, meshTreeCodeDict[word]]
            meshTermId += 1
        meshTermId = 0
        meshTermCount = len(meshTermAndCodeDictAll[queryId])
        #support.printDict(meshTermAndCodeDict, 1)
        S = probMatrix(meshTermCount, meshTermAndCodeDictAll[queryId], progressControl)
        f = calculateOriginalPangRankValue(meshTermCount, meshTermAndCodeDictAll[queryId],
                                           expansionMeshWordsCollect[queryId])
        U = [[1] * meshTermCount for row in range(meshTermCount)]
        n = meshTermCount
        alpha = 1.0
        pageRankValue = pageRank.pageRank(S, U, f, alpha, n)
        #print pageRankValue
        max = np.max(pageRankValue)
        for i in range(meshTermCount):
            word = meshTermAndCodeDictAll[queryId][i][0]
            #code = meshTermAndCodeDict[i][1]
            wordPageRankvalue = pageRankValue[i] / max
            #print queryId, word, wordPageRankvalue
            meshTermPageRankValueAll[queryId][word] = wordPageRankvalue
    #support.printDict(meshTermPageRankValueAll, 2)
    return meshTermPageRankValueAll
def main():
    matriz_de_transicion, world_wide_web = crawler()
    tiempo_inicio = time.time()
    pagerank = pageRank(matriz_de_transicion)
    dt = {'Paginas': list(world_wide_web.values()), 'Rank': pagerank}
    print(emoji.emojize('Pagerank :thumbsup:', use_aliases=True))
    df = pd.DataFrame(data=dt)
    print(df)
    df.to_csv('Pagerank.csv', encoding='utf-8', index=False)
#!/usr/bin/python
import os.path

from whoosh.index import create_in, open_dir
from whoosh.fields import *
from whoosh.qparser import *

import pageRank

ranks = pageRank.pageRank(pageRank.inverted_index, 0.85, 10)

ix = open_dir("../lab3/indexdir")
with ix.searcher() as searcher:
    query = QueryParser("content", ix.schema, group=OrGroup).parse(u"first document")
    results = searcher.search(query, limit=100)
    arr = [(r['id'], ranks[r['id']] * results.score(i))
           for (i, r) in enumerate(results) if r['id'] in ranks.keys()]
    final = sorted(arr, key=lambda (_, val): val, reverse=True)
    for r in final:
        print ">", r
import time

from pageRank import pageRank

links = [[]]

def read_file(filename):
    f = open(filename, 'r')
    for line in f:
        (frm, to) = map(int, line.split(" "))
        extend = max(frm - len(links), to - len(links)) + 1
        for i in range(extend):
            links.append([])
        links[frm].append(to)
    f.close()

fn = "1000.txt"
read_file(fn)

# Time five PageRank runs and record each duration in milliseconds.
f = open("time-%s" % fn, 'w')
for i in range(5):
    start = int(round(time.time() * 1000))
    pr = pageRank(links, alpha=0.85, convergence=0.00001, checkSteps=10)
    used = int(round(time.time() * 1000)) - start
    f.writelines(["no.%d time used: %s ms\n" % (i, used)])
f.close()

# sum = 0
# for i in range(len(pr)):
#     print i, "=", pr[i]
#     sum = sum + pr[i]
# print "s = " + str(sum)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
# OF THE POSSIBILITY OF SUCH DAMAGE.

import sys

from pageRank import pageRank

links = [[]]

def read_file(filename):
    f = open(filename, 'r')
    for line in f:
        (frm, to) = map(int, line.split(" "))
        extend = max(frm - len(links), to - len(links)) + 1
        for i in range(extend):
            links.append([])
        links[frm].append(to)
    f.close()

read_file(sys.argv[1])

pr = pageRank(links, alpha=0.85, convergence=0.00001, checkSteps=10)

sum = 0
for i in range(len(pr)):
    print i, "=", pr[i]
    sum = sum + pr[i]
print "s = " + str(sum)
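# Hedged sketch (not from the original sources): the two scripts above import
# pageRank() from a local pageRank module that is not included here. Assuming
# the (links, alpha, convergence, checkSteps) signature seen in the calls, a
# minimal power-iteration implementation could look like this; the original
# module may differ.
import numpy as np

def pageRank(links, alpha=0.85, convergence=0.00001, checkSteps=10):
    """links[i] is the list of page indices that page i links to."""
    n = len(links)
    rank = np.ones(n) / n
    while True:
        previous = rank.copy()
        # Run a small batch of iterations between convergence checks.
        for _ in range(checkSteps):
            new_rank = np.zeros(n)
            for src, targets in enumerate(links):
                if targets:
                    share = rank[src] / len(targets)
                    for dst in targets:
                        new_rank[dst] += share
                else:
                    # Dangling page: spread its rank uniformly over all pages.
                    new_rank += rank[src] / n
            rank = alpha * new_rank + (1.0 - alpha) / n
        if np.abs(rank - previous).sum() < convergence:
            return rank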
def computeGraph(self, counterFilter, tfidfFilter):
    logger.info('getGraph')

    # First list all Entities and convert them into Nodes
    allNodes = []
    for doc in self.docList.docs:
        for entity in doc.entities:
            n = Node(entity.name, entity.type)
            n.frequency = entity.tfidf
            allNodes.append(n)
            if not entity.type in self.types:
                self.types.append(entity.type)

    # Merge same Nodes together and aggregate the TFIDF
    print "allNodes len: " + str(len(allNodes))
    nodesDict = {}
    for node in allNodes:
        key = node.name + node.type
        if key not in nodesDict:
            nodesDict[key] = node
        else:
            # Aggregate
            nodesDict[key].counter += 1
            nodesDict[key].frequency += node.frequency
    print "Nbr of unique Nodes: " + str(len(nodesDict))

    # Compute the average TFIDF
    frequencies = []
    counters = []
    for key, node in nodesDict.iteritems():
        node.frequency = int(100000 * (node.frequency / node.counter))
        nodesDict[key] = node
        frequencies.append(node.frequency)
        counters.append(node.counter)

    # Take only the best nodes
    frequencies.sort()  # sorted by ascending order
    counters.sort()  # sorted by ascending order
    numberOfNode = 40
    print "len nodesDict " + str(len(nodesDict))
    print "min freq" + str(frequencies[-numberOfNode])
    print "min counter" + str(counters[-numberOfNode])
    for key, node in nodesDict.items():
        '''if node.frequency < frequencies[-numberOfNode]:
            del nodesDict[key]
        if node.counter < counters[-15]:
            del nodesDict[key]'''
        if node.counter < counterFilter or node.frequency < frequencies[-min(tfidfFilter, len(frequencies) - 1)]:
            del nodesDict[key]
    print "len nodesDict filtered" + str(len(nodesDict))

    # Set node's id
    i = 0
    IdToPosition = {}
    for key, node in nodesDict.iteritems():
        nodesDict[key].id = i  # TODO can be optimized and node just before in 'uniqueNodes.append(node)'
        nodesDict[key].id = abs(hash(key)) % (10 ** 8)
        IdToPosition[nodesDict[key].id] = i
        i += 1
    #Todo Update rank and frequency of Nodes ...
    # Create links with Weight
    linksDict = {}
    for doc in self.docList.docs:
        for i in range(len(doc.entities)):
            key1 = doc.entities[i].name + doc.entities[i].type
            if not key1 in nodesDict:  # Entity not selected
                continue
            for j in range(i + 1, len(doc.entities)):
                key2 = doc.entities[j].name + doc.entities[j].type
                if not key2 in nodesDict:  # Entity not selected
                    continue
                if nodesDict[key1] < nodesDict[key2]:
                    if (nodesDict[key1].id, nodesDict[key2].id) in linksDict:
                        linksDict[(nodesDict[key1].id, nodesDict[key2].id)][0] += 1
                        linksDict[(nodesDict[key1].id, nodesDict[key2].id)][1].append(doc.id)
                    else:
                        linksDict[(nodesDict[key1].id, nodesDict[key2].id)] = [1, [doc.id]]
                else:
                    if (nodesDict[key2].id, nodesDict[key1].id) in linksDict:
                        linksDict[(nodesDict[key2].id, nodesDict[key1].id)][0] += 1
                        linksDict[(nodesDict[key2].id, nodesDict[key1].id)][1].append(doc.id)
                    else:
                        linksDict[(nodesDict[key2].id, nodesDict[key1].id)] = [1, [doc.id]]
            pass
    print "linksDict len: " + str(len(linksDict))

    adjacency = np.zeros((len(nodesDict), len(nodesDict)))  # Adjacency matrix
    links = []
    for k, link in linksDict.viewitems():
        if not k[0] == k[1]:
            links.append(Link(k[0], k[1], link[0], link[1]))
            adjacency[IdToPosition[k[0]]][IdToPosition[k[1]]] = link[0]
            adjacency[IdToPosition[k[1]]][IdToPosition[k[0]]] = link[0]
    print "links len: " + str(len(links))

    # Rank the entity co-occurrence graph and store the PageRank value on each node.
    pr = pageRank(adjacency, .85, .000001)
    for key, node in nodesDict.iteritems():
        nodesDict[key].rank = pr[IdToPosition[nodesDict[key].id]]

    # Keep only link with weight >= 2
    '''entitiesToKeep = []
    for link in links:
        if link.weight <= 1:
            links = [x for x in links if x != link]  #links.remove(link)
        else:
            entitiesToKeep.append(link.source)
            entitiesToKeep.append(link.target)

    # Keep only entity connected
    for key, node in nodesDict.items():
        if not node.id in entitiesToKeep:
            del nodesDict[key]'''

    self.nodes = nodesDict.values()
    self.links = links
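# Hedged sketch (not part of the original source): computeGraph() above calls
# pageRank(adjacency, .85, .000001) on a symmetric weighted adjacency matrix,
# but that helper is defined elsewhere. A compatible dense power-iteration
# variant with a (matrix, damping, tolerance) signature might look like the
# following; the real implementation may differ.
import numpy as np

def pageRank(adjacency, damping=0.85, tolerance=0.000001):
    """Power iteration on a dense, possibly weighted adjacency matrix."""
    n = adjacency.shape[0]
    transition = np.array(adjacency, dtype=float)
    for col in range(n):
        total = transition[:, col].sum()
        if total > 0:
            transition[:, col] /= total      # normalise outgoing weight
        else:
            transition[:, col] = 1.0 / n     # dangling column: jump anywhere
    rank = np.ones(n) / n
    while True:
        new_rank = damping * transition.dot(rank) + (1.0 - damping) / n
        if np.abs(new_rank - rank).sum() < tolerance:
            return new_rank
        rank = new_rank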
# seeds
url_list = ['http://people.f4.htw-berlin.de/fileadmin/user_upload/Dozenten/WI-Dozenten/Classen/DAWeb/smdocs/d01.html']
url_list.append('http://people.f4.htw-berlin.de/fileadmin/user_upload/Dozenten/WI-Dozenten/Classen/DAWeb/smdocs/d06.html')
url_list.append('http://people.f4.htw-berlin.de/fileadmin/user_upload/Dozenten/WI-Dozenten/Classen/DAWeb/smdocs/d08.html')

crawl = Crawler(url_list)

def activate_crawl():
    if crawl.crawl_complete:
        print('website crawl complete')
    else:
        crawl.downloader()
        crawl.parser()
        crawl.frontier()
        # crawl feedback
        print('link temporary:\t\t', sorted(crawl.link_temporary))
        print('link set:\t\t\t', sorted(crawl.link_set))
        print('page_rank_graph:\t', crawl.page_rank_graph)
        print('not crawled:\t\t', crawl.url_seed)
        print('NEW RUN ++++++++++++++++++++++++++++++++++++++')

activate_crawl()
activate_crawl()

page_rank = pageRank(crawl.page_rank_graph)
print(page_rank.returnGraph())
print('number_of_sites:\t\t', page_rank.calc_number_of_sites())
sent_outfile = open('sent_proportions.txt', 'w')
sent_proportions, document_proportions, corpus_proportions = getTopicProportionsForCorpus(
    'C:\Users\madhura\Desktop\lda_modelling\d30007t', word_props)

for doc_sentences in sent_proportions:
    for k, v in doc_sentences.iteritems():
        sent_outfile.write(k.replace('\n', '') + "\t" + '\t'.join(str(x) for x in v) + "\n")
sent_outfile.close()  # flush the proportions to disk before they are read back below

sent_entities = []
i = 0
ranked_finals = ''

entityDict, sent_entity = getSentenceEntityList('sent_proportions.txt')
#for k, v in sent_entity.iteritems():
#    print k, v
#    sent_entities.append(v)

pageRankInput = inputforPageRank(sent_entity)
raw_input()
norm_pageRank = pageRank(pageRankInput, s=.86)
ranked_sentences = []
#print norm_pageRank
ranked_final = rankbyPageRank(norm_pageRank)
for every_el in ranked_final:
    ranked_sentences.append(every_el[0])
#ranked_sentences = set(ranked_sentences)
#print ranked_sentences
#raw_input()

unique_sentences = []
for rs in ranked_sentences:
    if rs not in unique_sentences:
        unique_sentences.append(rs)