/
HTMLParser.py
88 lines (74 loc) · 4.39 KB
/
HTMLParser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
from bs4 import BeautifulSoup
from zipfile import ZipFile
# Common English stop words intended to be removed from page text during preprocessing.
# The word list was sourced from https://gist.github.com/sebleier/554280
stop_words = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]
def buildHashMap(zipped_folder):
    '''
    Build a word index from a zipped folder of html pages.

    Every file in the archive is parsed with BeautifulSoup and its text is split
    on whitespace. A token is indexed only if it consists solely of letters; it is
    lowercased first, and tokens found in the module-level ``stop_words`` list are
    excluded.

    Args:
        zipped_folder: Archive to read; passed straight to ``ZipFile``.

    Returns:
        A dictionary keyed by lowercased word. Each value is a list with one entry
        per document containing the word, of the form
        ``{'doc_id': <archive member name>, 'freq': <occurrence count>,
        'tf_idf': 0, 'locations': [<token positions>]}``.
        Positions are indices into the whitespace-split page text, so they also
        count the tokens that were not indexed.

    Raises:
        Whatever ``ZipFile`` raises when the archive cannot be opened or read.
    '''
    # A set gives constant-time membership tests inside the per-token loop.
    excluded = frozenset(stop_words)
    word_list = {}

    with ZipFile(zipped_folder, 'r') as archive:
        # Display the directory of files in the zip folder
        archive.printdir()

        for member in archive.infolist():
            # Directory entries carry no page content.
            if member.is_dir():
                continue

            soup = BeautifulSoup(archive.read(member), 'html.parser')

            # Per-document statistics, keyed by word.
            doc_words = {}
            for position, token in enumerate(soup.get_text().split()):
                # Only purely alphabetic tokens are treated as words.
                if not token.isalpha():
                    continue
                word = token.lower()
                if word in excluded:
                    continue

                entry = doc_words.get(word)
                if entry is None:
                    doc_words[word] = {
                        'doc_id': member.filename,
                        'freq': 1,
                        # Placeholder: no tf-idf score is computed in this function.
                        'tf_idf': 0,
                        'locations': [position],
                    }
                else:
                    entry['freq'] += 1
                    entry['locations'].append(position)

            # Merge this document's entries into the corpus-wide vocabulary.
            for word, entry in doc_words.items():
                word_list.setdefault(word, []).append(entry)

    return word_list
def runSearch(corpus):
    '''
    Interactively search a word index for single terms.

    Prompts for a search term until the user submits an empty line (or the input
    stream ends), and prints the ``doc_id`` of every document entry stored under
    that term.

    The query is stripped of surrounding whitespace and lowercased before the
    lookup, because ``buildHashMap`` stores its keys in lowercase; without this a
    capitalised query could never match.

    Args:
        corpus: Mapping of word -> list of entries, each entry being a mapping
            with at least a ``'doc_id'`` key (the structure returned by
            ``buildHashMap``).

    Returns:
        None. All results are written to standard output.
    '''
    print('Beginning Search. Press Enter with no search term to exit at any time.')
    while True:
        # Request user input
        try:
            res = input('enter a search key=> ')
        except EOFError:
            # Input stream closed (e.g. piped input ran out): end the search
            # the same way an empty entry does instead of crashing.
            res = ''

        if res == '':
            print('Ending Search. Bye!')
            break

        # Normalise the query the same way the index keys were normalised.
        term = res.strip().lower()

        # Non-words and special-character terms never appear in the vocabulary.
        if not term.isalpha():
            print('Your search term must be a word! No special characters or numbers, please.')
            continue

        hits = corpus.get(term)
        if not hits:
            print('Your search returned no results')
            continue

        print("Found a match in the following pages: ")
        for hit in hits:
            print(hit['doc_id'])
# Script entry: build the word index from the 'Jan.zip' archive, then start the
# interactive search loop over it.
# NOTE(review): these statements are at module top level with no __main__ guard
# visible here, so they also run whenever this file is imported - confirm that
# is intended.
corpus = buildHashMap('Jan.zip')
runSearch(corpus)