-
Notifications
You must be signed in to change notification settings - Fork 0
/
__main__.py
63 lines (53 loc) · 1.73 KB
/
__main__.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
from frontier import Frontier
from parser import Parser
from graph import Graph
from pagerank import Ranker
from indexer import Indexer
from scorer import Scorer
# Seed documents that bootstrap the crawl.
seed_urls = [
    'http://mysql12.f4.htw-berlin.de/crawl/d01.html',
    'http://mysql12.f4.htw-berlin.de/crawl/d06.html',
    'http://mysql12.f4.htw-berlin.de/crawl/d08.html',
]
frontier = Frontier(seed_urls)
parser = Parser()
indexer = Indexer()
web_graph = Graph()

# Crawl every URL the frontier yields: index the page text, record its
# out-links in the web graph, and queue newly discovered URLs.
for url in frontier:
    # Fetch once: body text for tokenization/indexing, links for the graph.
    body, links_on_page = parser.parse(url)
    indexer.add_document(url, body)
    # Make sure the page itself exists as a node before wiring edges.
    # (The original bound the node to an unused local; the membership
    # check is all that is needed.)
    if web_graph.get_node(url) is None:
        web_graph.add_node(url)
    for out_link in links_on_page:
        web_graph.add_edge(url, out_link)
    # Hand links to the frontier so every reachable page gets crawled.
    frontier.add_urls(links_on_page)
# Run PageRank over the crawled web graph and show the resulting ranks.
# curb_factor is the damping factor; delta is the convergence threshold
# — presumably iteration stops once rank changes fall below it (verify
# against Ranker.calculate_rank).
ranker = Ranker(web_graph)
ranker.calculate_rank(curb_factor=0.95, delta=0.04)
print(ranker)
def _print_scores(label, score_list):
    """Print *label*, then each (doc, score) tuple, then a separator rule."""
    print(label)
    for doc_score in score_list:
        print(doc_score)
    print("________________________________")


scorer = Scorer(indexer, ranker)
# Sample queries: three single-term and one multi-term query.
queries = [['tokens'], ['index'], ['classification'], ['tokens', 'classification']]
for query in queries:
    print(str(query) + ':')
    # Plain TF-IDF cosine similarity, then the same scoring weighted
    # by the PageRank values computed above.
    _print_scores('non-weighted:', scorer.cosine_score(query))
    _print_scores('weighted with pagerank:', scorer.weighted_score(query))