# feature_extracter.py
# Helpers that compute simple features (token counts, link/image counts,
# word lengths, punctuation usage) from an HTML string.
from bs4 import BeautifulSoup as BS
from nltk.stem import WordNetLemmatizer
from nltk import tokenize
import string
def word_counts(html):
    """Return per-word occurrence counts for the visible text of *html*.

    The text is extracted with BeautifulSoup and split with NLTK's
    ``word_tokenize``. Tokens found in ``string.punctuation`` are dropped;
    the remaining tokens are lower-cased and lemmatized before counting.

    Args:
        html: HTML markup as a string.

    Returns:
        The result of ``make_dict`` applied to the normalized tokens.
    """
    lemmatizer = WordNetLemmatizer()
    # Name the parser explicitly so the result does not depend on which
    # optional parsers (lxml, html5lib) happen to be installed.
    text = BS(html, "html.parser").get_text()
    # `tokenize` is the nltk.tokenize package; it is not callable itself.
    words = [
        lemmatizer.lemmatize(token.lower())
        for token in tokenize.word_tokenize(text)
        if token not in string.punctuation
    ]
    return make_dict(words)
def make_dict(tokens):
    """Count how many times each token occurs.

    Args:
        tokens: Iterable of hashable tokens.

    Returns:
        A dict mapping each distinct token to its number of occurrences.
        An empty iterable yields an empty dict.
    """
    counts = {}
    for token in tokens:
        # Single lookup with a default instead of a membership test
        # followed by a separate initialisation and increment.
        counts[token] = counts.get(token, 0) + 1
    return counts
def link_count(html):
    """Return the number of ``<a>`` tags found in *html*.

    Args:
        html: HTML markup as a string.

    Returns:
        The count of ``a`` elements as an int.
    """
    # Name the parser explicitly so the parse tree (and therefore the count)
    # does not depend on which optional parsers are installed.
    soup = BS(html, "html.parser")
    return len(soup.find_all("a"))
def img_count(html):
    """Return the number of ``<img>`` tags found in *html*.

    Args:
        html: HTML markup as a string.

    Returns:
        The count of ``img`` elements as an int.
    """
    # Name the parser explicitly so the parse tree (and therefore the count)
    # does not depend on which optional parsers are installed.
    soup = BS(html, "html.parser")
    return len(soup.find_all("img"))
def junk_count(html):
    """Count tokens in the text of *html* that are neither words nor punctuation.

    A token is counted when ``str.isalpha()`` is false for it and it is not
    found in ``string.punctuation``.

    Args:
        html: HTML markup as a string.

    Returns:
        The number of such tokens as an int.
    """
    # Explicit parser keeps text extraction independent of installed parsers.
    text = BS(html, "html.parser").get_text()
    # `tokenize` is the nltk.tokenize package; it is not callable itself.
    return sum(
        1
        for token in tokenize.word_tokenize(text)
        if not token.isalpha() and token not in string.punctuation
    )
def word_count(html):
    """Return the number of non-punctuation tokens in the text of *html*.

    Args:
        html: HTML markup as a string.

    Returns:
        The count of tokens that are not found in ``string.punctuation``.
    """
    # Explicit parser keeps text extraction independent of installed parsers.
    text = BS(html, "html.parser").get_text()
    # `tokenize` is the nltk.tokenize package; it is not callable itself.
    return sum(
        1
        for token in tokenize.word_tokenize(text)
        if token not in string.punctuation
    )
def doc_length(html):
    """Return the total character length of the non-punctuation tokens in *html*.

    Args:
        html: HTML markup as a string.

    Returns:
        The sum of ``len(token)`` over tokens that are not found in
        ``string.punctuation``.
    """
    # Explicit parser keeps text extraction independent of installed parsers.
    text = BS(html, "html.parser").get_text()
    # `tokenize` is the nltk.tokenize package; it is not callable itself.
    return sum(
        len(token)
        for token in tokenize.word_tokenize(text)
        if token not in string.punctuation
    )
def longest_word(html):
    """Return the length of the longest token in the text of *html*.

    Unlike the other counters here, punctuation tokens are not filtered out.

    Args:
        html: HTML markup as a string.

    Returns:
        The maximum token length, or 0 when the text yields no tokens.
    """
    # Explicit parser keeps text extraction independent of installed parsers.
    text = BS(html, "html.parser").get_text()
    # `tokenize` is the nltk.tokenize package; it is not callable itself.
    # `default=0` avoids a ValueError on empty or markup-only input.
    return max((len(token) for token in tokenize.word_tokenize(text)), default=0)
def ave_word_length(html):
    """Return the average length of the non-punctuation tokens in *html*.

    Computed as ``doc_length(html)`` divided by ``word_count(html)``.

    Args:
        html: HTML markup as a string.

    Returns:
        The quotient above, or 0 when ``word_count`` reports no tokens.
    """
    n_words = word_count(html)
    # Guard first: an empty document would otherwise divide by zero.
    if n_words <= 0:
        return 0
    return doc_length(html) / n_words
def punc_count(html):
    """Count multi-character tokens in *html* that contain punctuation.

    A token is counted once if it is longer than one character and at least
    one of its characters is in ``string.punctuation``.

    Args:
        html: HTML markup as a string.

    Returns:
        The number of such tokens as an int.
    """
    # The original called `.get_text()` on the raw string instead of on the
    # parsed soup; parse first, then extract the text.
    text = BS(html, "html.parser").get_text()
    # `tokenize` is the nltk.tokenize package; it is not callable itself.
    return sum(
        1
        for token in tokenize.word_tokenize(text)
        if len(token) > 1 and any(char in string.punctuation for char in token)
    )
def test(html):
    """Print each feature this module computes for *html*, one per line.

    Args:
        html: HTML markup as a string.
    """
    # (label, extractor) pairs, in the order they are reported.
    reports = (
        ("Word counts:", word_counts),
        ("Link count:", link_count),
        ("Img count:", img_count),
        ("Junk count:", junk_count),
        ("Total word count:", word_count),
        ("Doc length", doc_length),
        ("Longest Word:", longest_word),
        ("Ave word length:", ave_word_length),
        ("Punc count:", punc_count),
    )
    for label, extract in reports:
        print(label, extract(html))