/
indexing.py
106 lines (88 loc) · 3.1 KB
/
indexing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#! /usr/bin/python
#
# Build a direct and an inverted index from the documents contained in a list
# (or, equivalently, yielded by an iterator)
#
# Test code: build the direct and inverted index of a set of downloaded documents
#
# Example:
# python indexing.py http://en.wikipedia.org/wiki/Business_intelligence 100
#############################################################
import math
from collections import Counter

import parser
# Create the direct index
def get_index (document_list):
    # Build the direct index: one entry per document, in arrival order.
    # Each document dict gains a 'terms' list (extracted from its text) and
    # an 'id' equal to its position in the returned list; the raw 'text'
    # field is dropped once the terms have been extracted.
    direct_index = []
    for position, doc in enumerate(document_list):
        # Tokenize the document body into terms
        doc['terms'] = parser.extract_terms(doc['text'])
        # The raw text is no longer needed once tokenized
        del doc['text']
        # A document's ID is simply its slot in the direct index
        doc['id'] = position
        direct_index.append(doc)
    return direct_index
# Return all terms with number of occurrences
def terms_with_count (terms):
    # Return a dict mapping each term to its number of occurrences in `terms`.
    # Counter performs the tallying at C speed; convert back to a plain dict
    # so the return type is exactly what the hand-rolled loop produced.
    return dict(Counter(terms))
# Invert the direct index
def invert_index (direct_index):
    # Build the inverted index of the document corpus.
    # The index maps each term to a dict with:
    #   'termid'      - the term's position of first insertion
    #   'occurrences' - list of (doc_id, occurrences, term_frequency) triplets
    #   'idf'         - the term's inverse document frequency (added below)
    # Terms whose IDF is 0 (i.e. appearing in every document) are dropped.
    inverted_index = {}
    for document in direct_index:
        docid = document['id']
        terms = document['terms']
        if not terms:
            # Guard: a document with no terms contributes nothing
            # (and would otherwise cause a division by zero below).
            continue
        n_terms = float(len(terms))
        # Per-document term counts
        for t, c in Counter(terms).items():
            tf = c / n_terms
            entry = inverted_index.get(t)
            if entry is not None:
                entry['occurrences'].append ((docid, c, tf))
            else:
                inverted_index[t] = {
                    # A term's ID is its insertion position in the inverted index
                    'termid': len(inverted_index),
                    'occurrences': [(docid, c, tf)]
                }
    # Add the IDF of each term to its inverted-index entry.
    # Iterate over a snapshot of the items: deleting from a dict while
    # iterating its live view raises RuntimeError on Python 3.
    n_documents = float(len(direct_index))
    for t, entry in list(inverted_index.items()):
        idf = math.log (n_documents / len(entry['occurrences']))
        if idf > 0:
            entry['idf'] = idf
        else:
            # IDF == 0 means the term occurs in every document: useless for
            # ranking, so remove it from the index entirely.
            del inverted_index[t]
    return inverted_index
# Given an inverted index, compute the IDF threshold value such that a given fraction
# of terms is above that level
def IDF_threshold (inverted_index, term_fraction = .99):
    # Compute the IDF value such that `term_fraction` of the indexed terms
    # have an IDF at or above it.
    # Collect every term's IDF in ascending order
    idfs = sorted(entry['idf'] for entry in inverted_index.values())
    # The cut point leaves the requested fraction of terms above it
    cut = int(len(idfs) * (1 - term_fraction))
    return idfs[cut]
if __name__ == '__main__':
    # Test driver: crawl from a seed URL, build both indexes, and report
    # corpus statistics.  Usage: python indexing.py <seed_url> <n_pages>
    import sys
    import crawler
    seed = sys.argv[1]
    n_pages = int(sys.argv[2])
    # print-as-function works on both Python 2 and 3; the original
    # print statements are a SyntaxError under Python 3.
    print('Downloading and indexing documents...')
    direct_index = get_index (crawler.recursive_download (seed, n_pages))
    print('%d documents downloaded.' % len(direct_index))
    print('Now computing the inverted index...')
    inverted_index = invert_index (direct_index)
    print('%d terms indexed.' % len(inverted_index))
    # '%%' escapes the literal percent sign in the format string
    print('99%% IDF threshold is %s' % IDF_threshold (inverted_index, .99))