/
text_mining.py
137 lines (119 loc) · 4.07 KB
/
text_mining.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
"""
Receives a link from Wikipedia (or just the title of it) and
finds how far away it is from Philosophy by only clicking the
first link in the page (without counting the disambiguations)
28/02/2020
@Nabih Estefan Diaz
"""
import doctest
import requests
import sys
import os.path
import string
from bs4 import BeautifulSoup
# Scheme and host shared by every Wikipedia URL this script builds.
base = 'https://en.wikipedia.org'
# Full URL of the article at which the chain of first links stops.
philosophy = '%s/wiki/Philosophy' % base
def _is_link_candidate(node):
    """Return True when *node* is a tag that may hold the article's first link."""
    # Text nodes (bare newlines or any other stray text) carry no tag name and
    # no ``has_attr`` method, so they are rejected before any tag-only call.
    if getattr(node, 'name', None) is None:
        return False
    # Same filter as before: skip <style> tags and any element that carries a
    # class attribute (presumably page furniture such as infoboxes or
    # hatnotes -- confirm against current Wikipedia markup).
    return node.name != 'style' and not node.has_attr('class')


def findFirst(link):
    """
    Return the href of the first usable link in a Wikipedia article.

    The page at ``link`` is downloaded and the children of the first ``div``
    inside ``#mw-content-text`` are scanned in document order. For each
    candidate element only its first ``<a>`` is considered; the element is
    skipped when that anchor is missing, has no ``href``, points at the
    geographic-coordinates article, or is the ``[1]`` citation marker.

    Args:
        link: URL of the Wikipedia page to inspect.

    Returns:
        The ``href`` attribute of the chosen anchor, exactly as it appears in
        the page (for article links this is a site-relative path such as
        ``/wiki/...``, which callers prefix with the site base URL).

    Raises:
        ValueError: if no element in the scanned content has a usable link.
        requests.RequestException: if the page cannot be downloaded in time.
    """
    # Link attached to coordinate read-outs; it is never the article's real
    # first link, so elements that start with it are skipped.
    coordinates_href = '/wiki/Geographic_coordinate_system'

    # A timeout keeps one stalled request from hanging the whole chain.
    response = requests.get(link, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Drill down to the content part of the html where the link lives.
    content = soup.body.find(id="content").find(id="bodyContent")
    node = content.find(id="mw-content-text").div.next

    # One pass applies every skip rule to every element, so an element reached
    # after skipping a citation or coordinate block is still checked for a
    # missing link, and running out of siblings ends the scan cleanly.
    while node is not None:
        if _is_link_candidate(node):
            anchor = node.a
            if (
                anchor is not None
                and anchor.has_attr('href')
                and anchor['href'] != coordinates_href
                and anchor.text != "[1]"
            ):
                # The href is returned as-is; the old code prefixed the base
                # URL and then sliced the same 24 characters back off.
                return anchor['href']
        node = node.next_sibling

    raise ValueError("No usable first link found on " + link)
def depth(link, file, steps):
    """
    Follow first links from ``link`` until the Philosophy page is reached.

    Every visited URL is written to ``file`` on its own tab-indented line, in
    visiting order, ending with the Philosophy URL itself.

    The walk is iterative and remembers the pages already seen. Chains of
    first links can loop back on themselves; the previous recursive version
    kept issuing one HTTP request per frame until the interpreter's recursion
    limit was hit.

    Args:
        link: full URL of the page to start from.
        file: writable text file object that receives the visited URLs.
        steps: unused; kept so existing callers that pass it keep working.

    Returns:
        Number of link clicks needed to get from ``link`` to Philosophy
        (0 when ``link`` already is the Philosophy page).

    Raises:
        RuntimeError: if the chain revisits a page and therefore can never
            reach Philosophy.
    """
    visited = set()
    clicks = 0
    while link != philosophy:
        if link in visited:
            raise RuntimeError(
                "Link chain loops back to " + link
                + " and never reaches " + philosophy
            )
        visited.add(link)
        # Resolve the next hop before logging, so a page whose first link
        # cannot be found is not recorded as a completed step.
        newLink = base + findFirst(link)
        file.write("\t" + link + "\n")
        clicks += 1
        link = newLink
    file.write("\t" + philosophy + "\n")
    return clicks
def main(args=sys.argv):
    """
    Run the main logic of the script.

    Builds the Wikipedia URL for the requested article, computes the chain of
    first links to Philosophy (unless a result file for that article already
    exists), stores it in ``Philosophy Chain <name>.txt`` and prints the file.

    Args:
        args: argument list in ``sys.argv`` layout; ``args[1]`` is the title
            of the Wikipedia article to start from.

    Raises:
        SystemExit: if no article title was supplied.
    """
    if len(args) < 2:
        raise SystemExit("usage: python text_mining.py <Wikipedia article title>")

    # Find link for name given. str.replace returns a new string; the old code
    # discarded it, so spaces were never turned into underscores.
    name = args[1]
    link = base + "/wiki/" + name.replace(" ", "_")
    # The file name keeps the title exactly as typed so result files written
    # by earlier runs are still found.
    filename = "Philosophy Chain " + name + ".txt"
    print(filename)

    # Only crawl when no stored result exists for this title.
    if not os.path.isfile(filename):
        finished = False
        try:
            with open(filename, "w+") as file:
                file.write("Starting at " + link + ", these are the steps to reach the Wikipedia Philosophy Webpage\n")
                result = depth(link, file, 0)
                file.write("\n\nThese were the %2d steps to reach the Wikipedia Philosophy Webpage" %result)
            finished = True
        finally:
            # A crawl that fails half-way must not leave a truncated file
            # behind, or the next run would present it as a finished result.
            if not finished and os.path.isfile(filename):
                os.remove(filename)

    # Print the stored chain (either it was found or just created).
    with open(filename, "r") as file:
        for line in file:
            print(line)
# Script entry point: call main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()