Esempio n. 1
0
File: crawl.py Progetto: noxerit/cms
from Frontier import Frontier
from PageRanker import PageRanker
from Indexer import Indexer
from Searcher import Searcher
import re

# Module-level instances of the project's Frontier, PageRanker and Indexer
# classes, each built with its default constructor arguments.
frontier = Frontier()
pageRanker = PageRanker()
indexer = Indexer()

# Absolute URLs of three HTML documents (d01, d06, d08) on the same host.
# NOTE(review): presumably the crawl's starting points; the code that
# consumes this list is not visible in this excerpt.
seedDocuments = [
    'http://people.f4.htw-berlin.de/fileadmin/user_upload/Dozenten/WI-Dozenten/Classen/DAWeb/smdocs/d01.html',
    'http://people.f4.htw-berlin.de/fileadmin/user_upload/Dozenten/WI-Dozenten/Classen/DAWeb/smdocs/d06.html',
    'http://people.f4.htw-berlin.de/fileadmin/user_upload/Dozenten/WI-Dozenten/Classen/DAWeb/smdocs/d08.html'
]

def printWebGraph(webGraph):
    """Print the web graph to stdout, one line per key in sorted order.

    Output is a blank line, the '-*( Web Graph )*-' banner, another blank
    line, then one ``key -> a, b, c`` line per entry.

    Args:
        webGraph: mapping from a string key to an iterable of strings; the
            values are joined with ', ' on the key's line.
    """
    # Single-argument print(...) parses as a print statement on Python 2 and
    # as a function call on Python 3, giving identical output on both.
    # print('') is used for blank lines because print() would emit "()" on
    # Python 2.
    print('')
    print('-*( Web Graph )*-')
    print('')
    for entry in sorted(webGraph):
        print(entry + ' -> ' + ', '.join(webGraph[entry]))

def printIndex(index):
    """Print the index to stdout, one line per entry in sorted key order.

    Output is a blank line, the '-*( Indices )*-' banner, another blank
    line, then one ``(term, df:N) -> occurrences`` line per entry.

    Args:
        index: mapping whose keys are indexable pairs: ``key[0]`` must be a
            string and ``key[1]`` is shown after ``df:``. Each value is
            rendered with ``str()`` and stripped of quote characters.
    """
    # Single-argument print(...) is valid on both Python 2 and Python 3;
    # print('') yields a blank line on both (print() would emit "()" on 2).
    print('')
    print('-*( Indices )*-')
    print('')
    # .items() replaces the Python 2-only .iteritems(); keys are unique, so
    # the sort order is decided by the keys alone.
    for term, occurrences in sorted(index.items()):
        # Remove quotes and Python 2's u'' prefix from the value's repr.
        # NOTE(review): the pattern also eats a literal "u" that sits right
        # before a closing quote (e.g. 'menu' is shown as men).
        rendered = re.sub('(u)?\'', '', str(occurrences))
        # One print instead of a trailing-comma print followed by a second
        # print: the emitted text ("... -> <rendered>") is the same.
        print('(' + term[0] + ', df:' + str(term[1]) + ') -> ' + rendered)
Esempio n. 2
0
File: crawl.py Progetto: noxerit/cms
from Frontier import Frontier
from PageRanker import PageRanker
from Indexer import Indexer
from Searcher import Searcher
import re

# Module-level instances of the project's Frontier, PageRanker and Indexer
# classes, each built with its default constructor arguments.
frontier = Frontier()
pageRanker = PageRanker()
indexer = Indexer()

# Absolute URLs of three HTML documents (d01, d06, d08) on the same host.
# NOTE(review): presumably the crawl's starting points; the code that
# consumes this list is not visible in this excerpt.
seedDocuments = [
    'http://people.f4.htw-berlin.de/fileadmin/user_upload/Dozenten/WI-Dozenten/Classen/DAWeb/smdocs/d01.html',
    'http://people.f4.htw-berlin.de/fileadmin/user_upload/Dozenten/WI-Dozenten/Classen/DAWeb/smdocs/d06.html',
    'http://people.f4.htw-berlin.de/fileadmin/user_upload/Dozenten/WI-Dozenten/Classen/DAWeb/smdocs/d08.html'
]


def printWebGraph(webGraph):
    """Write the web graph to stdout as sorted ``key -> a, b, c`` lines.

    A blank line, the '-*( Web Graph )*-' banner and a second blank line
    are printed before the entries.

    Args:
        webGraph: mapping from a string key to an iterable of strings; the
            values are joined with ', ' on the key's line.
    """
    # Parenthesised single-argument print works as a statement on Python 2
    # and as a function call on Python 3 with the same output. print('')
    # stands in for a bare print, since print() would show "()" on Python 2.
    print('')
    print('-*( Web Graph )*-')
    print('')
    for entry in sorted(webGraph):
        print(entry + ' -> ' + ', '.join(webGraph[entry]))


def printIndex(index):
    """Print an 'Indices' banner, then walk the index entries in sorted order.

    Each key of ``index`` is indexed as a pair: ``term[0]`` is concatenated
    as a string and ``term[1]`` is shown after ``df:``.

    NOTE(review): the loop body is inconsistent as it stands -- see the
    comment on its last two lines.
    """
    print
    print '-*( Indices )*-'
    print
    for term, occurences in sorted(index.iteritems()):
        # Trailing comma: the Python 2 print statement does not end the line.
        print '(' + term[0] + ', df:' + str(term[1]) + ') ->',
        # NOTE(review): `pairSplit`, `field` and `query` are not defined
        # anywhere in this function, and `occurences` is never used. These
        # two lines look like they were spliced in from a different script;
        # confirm against the original crawl.py before relying on them.
        value = pairSplit[1].strip()
        query[ field] = value

# Module-level lookup tables, each starting out as its own empty dict.
# All four names are rebound from graphMaker.buildDicts( query) further down.
quotes = dict()
quotedBy = dict()
similars = dict()
acordaos = dict()

def mergeDictsSets( h1, h2):
    """Union the sets of ``h2`` into ``h1`` for every key both dicts share.

    ``h1`` is updated in place and is also the return value. Keys found only
    in ``h2`` are not added to ``h1``; keys found only in ``h1`` are left
    untouched. ``h2`` itself is never modified.

    Args:
        h1: dict whose values provide ``.union()`` (e.g. sets).
        h2: dict of iterables to fold into the matching ``h1`` entries.

    Returns:
        The same ``h1`` object that was passed in.
    """
    for key, extra in h2.items():
        if key not in h1:
            continue
        # union() builds a new set, so the set previously stored in h1 is
        # replaced rather than mutated.
        h1[key] = h1[key].union(extra)
    return h1
try:
    graphMaker = GraphMaker( dbName, collectionInName, collectionOutName)
    pageRanker = PageRanker()
    tini = t1 = datetime.now()
    [acordaos, quotes, quotedBy, similars] = graphMaker.buildDicts( query)
    with open('graphPageRankingLog', 'a') as f:
        f.write( "build dicts time %d\n" % (datetime.now() - t1).seconds)
    #pageRanks = pageRanker.calculatePageRanks( acordaos, quotes, quotedBy, pageRankMode)
    t1 = datetime.now()
    [quotes, quotedBy] = graphMaker.removeInvalidAcordaosFromDicts( acordaos, quotes, quotedBy)
    with open('graphPageRankingLog', 'a') as f:
        f.write("remove invalid acordaos from dicts %d\n" % (datetime.now() - t1).seconds)
    t1 = datetime.now()
    quotesPlusSimilars = mergeDictsSets( quotes, similars) 
    quotedByPlusSimilars = mergeDictsSets( quotedBy, similars) 
    with open('graphPageRankingLog', 'a') as f:
        f.write("merge quotes with similars %d\n" % (datetime.now() - t1).seconds)
    t1 = datetime.now()