Example #1
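# Scores a query with Whoosh TF-IDF and BM25F over the same index, then
# combines the two scores with a precomputed PageRank value using the weights
# returned by start() (a helper from the surrounding file; Python 2).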
from whoosh.index import open_dir
from whoosh.qparser import QueryParser
from whoosh import scoring
import pageRank  # local module from the surrounding project


def whooshOpen(query):
    ix = open_dir("../lab3/indexdir")

    results_dict = {}

    query = QueryParser('content', ix.schema).parse(query)
    with ix.searcher(weighting=scoring.TF_IDF()) as s_tf:
        tf_results = s_tf.search(query, limit=100)
        for r in tf_results:
            results_dict.setdefault(r.docnum, []).append(r.score)

    with ix.searcher(weighting=scoring.BM25F()) as s_bm:
        bm_results = s_bm.search(query, limit=100)
        for r in bm_results:
            results_dict.setdefault(r.docnum, []).append(r.score)

    ranks = pageRank.pageRank(pageRank.inverted_index, 0.85, 10)

    l = []
    for (id, vals) in results_dict.iteritems():
        if len(vals) == 2:
            l.append((vals[0], vals[1], ranks[id]))

    expected = start()

    ys = []
    for (tf, bm, pr) in l:
        ys.append(bm * expected[0] + tf * expected[1] + pr * expected[2] +
                  expected[3])

    print ys
Example #2
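# Sweeps the damping factor (beta) from minBeta to maxBeta in steps of `step`,
# recording how many iterations PageRank needs to converge and the final rank
# of every node, then plots both against beta and saves the figure as a PNG.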
import matplotlib.pyplot as plt
import pageRank as pR  # assumed: the local pageRank module, aliased as pR


def plot(filename, minBeta, maxBeta, step):
    graph = pR.readGraph(filename)
    N = graph.getSize()
    betas = []
    iterations = []
    ranks = [[] for i in range(N)]
    while minBeta <= maxBeta:
        betas.append(minBeta)
        rks, its = pR.pageRank(graph, minBeta)
        iterations.append(its)
        for n in range(N):
            ranks[n].append(rks[n][-1])
        minBeta += step
    fig, axis = plt.subplots(1, 2)
    #plt.figure()
    ax = axis[0]
    ax.errorbar(betas, iterations, fmt='o--', color='b', capthick=2)
    ax.set_title("PageRank #iterations \nfor graph " + filename.split(".")[0])
    ax.set_xlabel("Beta values")
    ax.set_ylabel("Iterations")
    ax = axis[1]
    for n in range(N):
        ax.plot(betas, ranks[n], label="Node " + str(n + 1))
    ax.set_title("PageRank values \nfor graph " + filename.split(".")[0])
    ax.set_xlabel("Beta values")
    ax.set_ylabel("PageRank values")
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

    fig.tight_layout()
    plt.savefig(filename.split(".")[0] + "_plot.png",
                bbox_inches='tight',
                dpi=100)
    plt.show()
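Example #3
# Normalizes MeSH-term PageRank values per query: maps each expansion term to
# its tree code, builds a probability matrix, runs pageRank, and divides each
# value by the per-query maximum.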
def calculatePageRank(expansionMeshWordsCollect, progressControl, MeshTreeField):
    import pageRank
    import numpy as np  # assumed to be imported at module level in the original
    meshTreeCodeDict = loadMeshTreeCode(MeshTreeField)
    #queryId = '20'
    meshTermAndCodeDictAll = {}
    meshTermPageRankValueAll = {}
    for queryId in expansionMeshWordsCollect:
        meshTermAndCodeDictAll[queryId] = {}
        meshTermPageRankValueAll[queryId] = {}
        #meshTermAndCodeDict = {}
        meshTermId = 0
        for word in expansionMeshWordsCollect[queryId]:
            meshTermAndCodeDictAll[queryId][meshTermId] = [word, meshTreeCodeDict[word]]
            meshTermId += 1
        meshTermId = 0
        meshTermCount = len(meshTermAndCodeDictAll[queryId])
        #support.printDict(meshTermAndCodeDict, 1)
        S = probMatrix(meshTermCount, meshTermAndCodeDictAll[queryId], progressControl)
        f = calculateOriginalPangRankValue(meshTermCount, meshTermAndCodeDictAll[queryId], expansionMeshWordsCollect[queryId])
        U = [[1]*meshTermCount for row in range(meshTermCount)]
        n = meshTermCount
        alpha=1.0
        pageRankValue = pageRank.pageRank(S, U, f, alpha, n)
        
        #print pageRankValue
        max = np.max(pageRankValue)
        for i in range(meshTermCount):
            word = meshTermAndCodeDictAll[queryId][i][0]
            #code = meshTermAndCodeDict[i][1]
            wordPageRankvalue = pageRankValue[i]/max
            #print queryId, word, wordPageRankvalue
            meshTermPageRankValueAll[queryId][word] = wordPageRankvalue
    #support.printDict(meshTermPageRankValueAll, 2)    
    return meshTermPageRankValueAll    
Example #4
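# Builds a transition matrix with crawler(), runs pageRank on it, and writes
# the resulting page/rank table to Pagerank.csv with pandas.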
import time
import emoji
import pandas as pd
# crawler() and pageRank() are assumed to come from the surrounding project.


def main():
    matriz_de_transicion, world_wide_web = crawler()

    tiempo_inicio = time.time()
    pagerank = pageRank(matriz_de_transicion)
    dt = {'Paginas': list(world_wide_web.values()), 'Rank': pagerank}
    print(emoji.emojize('Pagerank :thumbsup:', use_aliases=True))
    df = pd.DataFrame(data=dt)
    print(df)

    df.to_csv('Pagerank.csv', encoding='utf-8', index=False)
Example #5
def calculatePageRank(expansionMeshWordsCollect, progressControl,
                      MeshTreeField):
    import pageRank
    import numpy as np  # assumed to be imported at module level in the original
    meshTreeCodeDict = loadMeshTreeCode(MeshTreeField)
    #queryId = '20'
    meshTermAndCodeDictAll = {}
    meshTermPageRankValueAll = {}
    for queryId in expansionMeshWordsCollect:
        meshTermAndCodeDictAll[queryId] = {}
        meshTermPageRankValueAll[queryId] = {}
        #meshTermAndCodeDict = {}
        meshTermId = 0
        for word in expansionMeshWordsCollect[queryId]:
            meshTermAndCodeDictAll[queryId][meshTermId] = [
                word, meshTreeCodeDict[word]
            ]
            meshTermId += 1
        meshTermId = 0
        meshTermCount = len(meshTermAndCodeDictAll[queryId])
        #support.printDict(meshTermAndCodeDict, 1)
        S = probMatrix(meshTermCount, meshTermAndCodeDictAll[queryId],
                       progressControl)
        f = calculateOriginalPangRankValue(meshTermCount,
                                           meshTermAndCodeDictAll[queryId],
                                           expansionMeshWordsCollect[queryId])
        U = [[1] * meshTermCount for row in range(meshTermCount)]
        n = meshTermCount
        alpha = 1.0
        pageRankValue = pageRank.pageRank(S, U, f, alpha, n)

        #print pageRankValue
        max = np.max(pageRankValue)
        for i in range(meshTermCount):
            word = meshTermAndCodeDictAll[queryId][i][0]
            #code = meshTermAndCodeDict[i][1]
            wordPageRankvalue = pageRankValue[i] / max
            #print queryId, word, wordPageRankvalue
            meshTermPageRankValueAll[queryId][word] = wordPageRankvalue
    #support.printDict(meshTermPageRankValueAll, 2)
    return meshTermPageRankValueAll
Example #6
#!/usr/bin/python
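# Re-ranks Whoosh hits for the query "first document" by multiplying each
# hit's retrieval score with its precomputed PageRank value (Python 2).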

import os.path
from whoosh.index import create_in, open_dir
from whoosh.fields import *
from whoosh.qparser import *
import pageRank



ranks = pageRank.pageRank(pageRank.inverted_index, 0.85, 10)

ix = open_dir("../lab3/indexdir")
with ix.searcher() as searcher:
    query = QueryParser("content", ix.schema, group=OrGroup).parse(u"first document")
    results = searcher.search(query, limit=100)

    arr = [(r['id'], ranks[r['id']] * results.score(i))
           for (i, r) in enumerate(results) if r['id'] in ranks.keys()]
    final = sorted(arr, key=lambda (_, val): val, reverse=True)

    for r in final:
        print ">", r
Example #7
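# Benchmark: reads an edge list ("from to" pairs, one per line) into an
# adjacency list and times five pageRank runs over it, writing timings to a file.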
import time
from pageRank import pageRank  # assumed: local pageRank module

links = [[]]


def read_file(filename):
    f = open(filename, 'r')
    for line in f:
        (frm, to) = map(int, line.split(" "))
        extend = max(frm - len(links), to - len(links)) + 1
        for i in range(extend):
            links.append([])
        links[frm].append(to)
    f.close()


fn = "1000.txt"
read_file(fn)

f = open("time-%s" % fn, 'w')
for i in range(5):
    start = int(round(time.time() * 1000))
    pr = pageRank(links, alpha=0.85, convergence=0.00001, checkSteps=10)
    used = int(round(time.time() * 1000)) - start
    f.writelines(["no.%d time used: %s ms\n" % (i, used)])
f.close()

# sum = 0
# for i in range(len(pr)):
#     print i, "=", pr[i]
#     sum = sum + pr[i]
# print "s = " + str(sum)
Example #8

import sys
from pageRank import pageRank

links = [[]]

def read_file(filename):
    f = open(filename, 'r')
    for line in f:
        (frm, to) = map(int, line.split(" "))
        extend = max(frm - len(links), to - len(links)) + 1
        for i in range(extend):
            links.append([])
        links[frm].append(to)
    f.close()

read_file(sys.argv[1])

pr = pageRank(links, alpha=0.85, convergence=0.00001, checkSteps=10)
sum = 0
for i in range(len(pr)):
    print i, "=", pr[i]
    sum = sum + pr[i]
print "s = " + str(sum)


Example #9
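  # Builds an entity co-occurrence graph from the document list: entities become
  # nodes (deduplicated, TF-IDF averaged, then filtered), co-occurrences within
  # a document become weighted links, and PageRank over the adjacency matrix
  # sets each node's rank (Python 2).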
  def computeGraph(self, counterFilter, tfidfFilter):
    logger.info('getGraph')
    
    # First list all Entities and convert them into Nodes
    allNodes = []
    for doc in self.docList.docs:
      for entity in doc.entities:
        n = Node(entity.name, entity.type)
        n.frequency = entity.tfidf
        allNodes.append(n)
        
        if not entity.type in self.types:
          self.types.append(entity.type)

    # Merge same Nodes together and aggregate the TFIDF
    print "allNodes len: " + str(len(allNodes))
    nodesDict = {}
    for node in allNodes:
        key = node.name + node.type
        if key not in nodesDict:
          nodesDict[key] = node
        else: #Aggregate
          nodesDict[key].counter += 1
          nodesDict[key].frequency += node.frequency
    print "Nbr of unique Nodes: " + str(len(nodesDict))
    
    #Compute the average TFIDF
    frequencies = []
    counters = []
    for key, node in nodesDict.iteritems():
      node.frequency = int(100000*(node.frequency/node.counter))
      nodesDict[key] = node
      frequencies.append(node.frequency)
      counters.append(node.counter)

    # Take only the best nodes
    frequencies.sort() # sorted by ascending order
    counters.sort() # sorted by ascending order
    numberOfNode = 40
    print "len nodesDict " + str(len(nodesDict))
    print "min freq" + str(frequencies[-numberOfNode])
    print "min counter" + str(counters[-numberOfNode])
    for key, node in nodesDict.items():
      '''if node.frequency < frequencies[-numberOfNode]:
        del nodesDict[key]
      if node.counter < counters[-15]:
        del nodesDict[key]'''
      if node.counter < counterFilter or node.frequency < frequencies[-min(tfidfFilter,len(frequencies)-1)]:
        del nodesDict[key]
    print "len nodesDict filtered" + str(len(nodesDict))
    
    # Set node's id
    i = 0
    IdToPosition = {}
    for key, node in nodesDict.iteritems():
      nodesDict[key].id = i # TODO can be optimized and node just before in 'uniqueNodes.append(node)'
      nodesDict[key].id = abs(hash(key)) % (10 ** 8)
      IdToPosition[ nodesDict[key].id ] = i
      i += 1
      
    #Todo Update rank and frequency of Nodes ...
    
    #Create links with Weight
    linksDict = {}
    for doc in self.docList.docs:
      for i in range(len(doc.entities)):
        key1 = doc.entities[i].name + doc.entities[i].type
        if not key1 in nodesDict: # Entity not selected
          continue
        for j in range(i+1, len(doc.entities)):
          key2 = doc.entities[j].name + doc.entities[j].type
          if not key2 in nodesDict: # Entity not selected
            continue
          # Canonical direction: order the link key by node id.
          if nodesDict[key1].id < nodesDict[key2].id:
            if (nodesDict[key1].id, nodesDict[key2].id) in linksDict:
              linksDict[(nodesDict[key1].id, nodesDict[key2].id)][0] += 1
              linksDict[(nodesDict[key1].id, nodesDict[key2].id)][1].append(doc.id)
            else:
              linksDict[(nodesDict[key1].id, nodesDict[key2].id)] = [1, [doc.id]]
          else:
            if (nodesDict[key2].id, nodesDict[key1].id) in linksDict:
              linksDict[(nodesDict[key2].id, nodesDict[key1].id)][0] += 1
              linksDict[(nodesDict[key2].id, nodesDict[key1].id)][1].append(doc.id)
            else:
              linksDict[(nodesDict[key2].id, nodesDict[key1].id)] = [1, [doc.id]]

    print "linksDict len: " + str(len(linksDict))
        
    adjacency = np.zeros((len(nodesDict), len(nodesDict)))# Adjacency matrix
    links = []
    for k, link in linksDict.viewitems():
      if not k[0] == k[1]:
        links.append(Link(k[0],k[1],link[0],link[1]))
        adjacency[IdToPosition[k[0]]][IdToPosition[k[1]]] = link[0]
        adjacency[IdToPosition[k[1]]][IdToPosition[k[0]]] = link[0]
    
    print "links len: " + str(len(links))
    
    pr = pageRank(adjacency, .85, .000001)
    for key, node in nodesDict.iteritems():
      nodesDict[key].rank = pr[IdToPosition[nodesDict[key].id]]  
    
    # Keep only link with weight >= 2
    '''entitiesToKeep = []
    for link in links:
      if link.weight <= 1:
        links = [x for x in links if x != link]
        #links.remove(link)
      else:
        entitiesToKeep.append(link.source)
        entitiesToKeep.append(link.target)
        
    # Keep only entity connected
    for key, node in nodesDict.items():
      if not node.id in entitiesToKeep:
        del nodesDict[key]'''
        
    self.nodes = nodesDict.values()
    self.links = links
Example #10
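# Crawls a small set of seed pages with a project-specific Crawler class, then
# runs pageRank on the resulting link graph and prints basic statistics.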
#seeds
url_list = ['http://people.f4.htw-berlin.de/fileadmin/user_upload/Dozenten/WI-Dozenten/Classen/DAWeb/smdocs/d01.html']
url_list.append('http://people.f4.htw-berlin.de/fileadmin/user_upload/Dozenten/WI-Dozenten/Classen/DAWeb/smdocs/d06.html')
url_list.append('http://people.f4.htw-berlin.de/fileadmin/user_upload/Dozenten/WI-Dozenten/Classen/DAWeb/smdocs/d08.html')

crawl = Crawler(url_list)

def activate_crawl():

    if crawl.crawl_complete:
        print('website crawl complete')

    else:
        crawl.downloader()
        crawl.parser()
        crawl.frontier()

        #crawl feedback
        print('link temporary:\t\t', sorted(crawl.link_temporary))
        print('link set:\t\t\t', sorted(crawl.link_set))
        print('page_rank_graph:\t', crawl.page_rank_graph)
        print('not crawled:\t\t',crawl.url_seed)
        print('NEW RUN ++++++++++++++++++++++++++++++++++++++')

        activate_crawl()

activate_crawl()

page_rank = pageRank(crawl.page_rank_graph)
print(page_rank.returnGraph())
print('number_of_sites:\t\t', page_rank.calc_number_of_sites())
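Example #11
# Excerpt of a sentence-ranking pipeline: writes per-sentence topic proportions
# to a file, builds a PageRank input from sentence entities, ranks the
# sentences, and keeps the unique top-ranked ones (Python 2).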
    for doc_sentences in sent_proportions:
        for k, v in doc_sentences.iteritems():
            sent_outfile.write(
                k.replace('\n', '') + "\t" + '\t'.join(str(x)
                                                       for x in v) + "\n")
    sent_entities = []
    i = 0
    ranked_finals = ''
    entityDict, sent_entity = getSentenceEntityList('sent_proportions.txt')
    #for k,v in sent_entity.iteritems():
    #	print k,v
    #	sent_entities.append(v)
    pageRankInput = inputforPageRank(sent_entity)
    raw_input()
    norm_pageRank = pageRank(pageRankInput, s=.86)
    ranked_sentences = []
    #print norm_pageRank
    ranked_final = rankbyPageRank(norm_pageRank)
    for every_el in ranked_final:
        ranked_sentences.append(every_el[0])
    #ranked_sentences = set(ranked_sentences)

    #print ranked_sentences
    #raw_input()
    unique_sentences = []

    for rs in ranked_sentences:
        if rs not in unique_sentences:
            unique_sentences.append(rs)
Example #12
    def computeGraph(self, counterFilter, tfidfFilter):
        logger.info('getGraph')

        # First list all Entities and convert them into Nodes
        allNodes = []
        for doc in self.docList.docs:
            for entity in doc.entities:
                n = Node(entity.name, entity.type)
                n.frequency = entity.tfidf
                allNodes.append(n)

                if not entity.type in self.types:
                    self.types.append(entity.type)

        # Merge same Nodes together and aggregate the TFIDF
        print "allNodes len: " + str(len(allNodes))
        nodesDict = {}
        for node in allNodes:
            key = node.name + node.type
            if key not in nodesDict:
                nodesDict[key] = node
            else:  #Aggregate
                nodesDict[key].counter += 1
                nodesDict[key].frequency += node.frequency
        print "Nbr of unique Nodes: " + str(len(nodesDict))

        #Compute the average TFIDF
        frequencies = []
        counters = []
        for key, node in nodesDict.iteritems():
            node.frequency = int(100000 * (node.frequency / node.counter))
            nodesDict[key] = node
            frequencies.append(node.frequency)
            counters.append(node.counter)

        # Take only the best nodes
        frequencies.sort()  # sorted by ascending order
        counters.sort()  # sorted by ascending order
        numberOfNode = 40
        print "len nodesDict " + str(len(nodesDict))
        print "min freq" + str(frequencies[-numberOfNode])
        print "min counter" + str(counters[-numberOfNode])
        for key, node in nodesDict.items():
            '''if node.frequency < frequencies[-numberOfNode]:
        del nodesDict[key]
      if node.counter < counters[-15]:
        del nodesDict[key]'''
            if node.counter < counterFilter or node.frequency < frequencies[
                    -min(tfidfFilter,
                         len(frequencies) - 1)]:
                del nodesDict[key]
        print "len nodesDict filtered" + str(len(nodesDict))

        # Set node's id
        i = 0
        IdToPosition = {}
        for key, node in nodesDict.iteritems():
            nodesDict[
                key].id = i  # TODO can be optimized and node just before in 'uniqueNodes.append(node)'
            nodesDict[key].id = abs(hash(key)) % (10**8)
            IdToPosition[nodesDict[key].id] = i
            i += 1

        #Todo Update rank and frequency of Nodes ...

        #Create links with Weight
        linksDict = {}
        for doc in self.docList.docs:
            for i in range(len(doc.entities)):
                key1 = doc.entities[i].name + doc.entities[i].type
                if not key1 in nodesDict:  # Entity not selected
                    continue
                for j in range(i + 1, len(doc.entities)):
                    key2 = doc.entities[j].name + doc.entities[j].type
                    if not key2 in nodesDict:  # Entity not selected
                        continue
                    # Canonical direction: order the link key by node id.
                    if nodesDict[key1].id < nodesDict[key2].id:
                        if (nodesDict[key1].id,
                                nodesDict[key2].id) in linksDict:
                            linksDict[(nodesDict[key1].id,
                                       nodesDict[key2].id)][0] += 1
                            linksDict[(nodesDict[key1].id,
                                       nodesDict[key2].id)][1].append(doc.id)
                        else:
                            linksDict[(nodesDict[key1].id,
                                       nodesDict[key2].id)] = [1, [doc.id]]
                    else:
                        if (nodesDict[key2].id,
                                nodesDict[key1].id) in linksDict:
                            linksDict[(nodesDict[key2].id,
                                       nodesDict[key1].id)][0] += 1
                            linksDict[(nodesDict[key2].id,
                                       nodesDict[key1].id)][1].append(doc.id)
                        else:
                            linksDict[(nodesDict[key2].id,
                                       nodesDict[key1].id)] = [1, [doc.id]]

        print "linksDict len: " + str(len(linksDict))

        adjacency = np.zeros(
            (len(nodesDict), len(nodesDict)))  # Adjacency matrix
        links = []
        for k, link in linksDict.viewitems():
            if not k[0] == k[1]:
                links.append(Link(k[0], k[1], link[0], link[1]))
                adjacency[IdToPosition[k[0]]][IdToPosition[k[1]]] = link[0]
                adjacency[IdToPosition[k[1]]][IdToPosition[k[0]]] = link[0]

        print "links len: " + str(len(links))

        pr = pageRank(adjacency, .85, .000001)
        for key, node in nodesDict.iteritems():
            nodesDict[key].rank = pr[IdToPosition[nodesDict[key].id]]

        # Keep only link with weight >= 2
        '''entitiesToKeep = []
    for link in links:
      if link.weight <= 1:
        links = [x for x in links if x != link]
        #links.remove(link)
      else:
        entitiesToKeep.append(link.source)
        entitiesToKeep.append(link.target)
        
    # Keep only entity connected
    for key, node in nodesDict.items():
      if not node.id in entitiesToKeep:
        del nodesDict[key]'''

        self.nodes = nodesDict.values()
        self.links = links
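Example #13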
	sent_outfile = open('sent_proportions.txt','w')
	sent_proportions, document_proportions, corpus_proportions = getTopicProportionsForCorpus('C:\Users\madhura\Desktop\lda_modelling\d30007t',word_props)
	
	for doc_sentences in sent_proportions:
		for k,v in doc_sentences.iteritems():
			sent_outfile.write(k.replace('\n','')+"\t"+'\t'.join(str(x) for x in v) + "\n")
	sent_entities = []
	i = 0
	ranked_finals = ''
	entityDict, sent_entity = getSentenceEntityList('sent_proportions.txt')
	#for k,v in sent_entity.iteritems():
	#	print k,v
	#	sent_entities.append(v)
	pageRankInput = inputforPageRank(sent_entity)
	raw_input()
	norm_pageRank = pageRank(pageRankInput,s=.86)
	ranked_sentences = []
	#print norm_pageRank
	ranked_final = rankbyPageRank(norm_pageRank)
	for every_el in ranked_final:
		ranked_sentences.append(every_el[0])
	#ranked_sentences = set(ranked_sentences)
	
	#print ranked_sentences
	#raw_input()
	unique_sentences = []
	
	for rs in ranked_sentences:
		if rs not in unique_sentences:
			unique_sentences.append(rs)