#hyperlink_fetch
import re
import urllib2
from bs4 import BeautifulSoup
from collections import defaultdict
import text_manip
'''
Given a directory, a Wikipedia page link and the depth you want to go to, this module generates a file hierarchy rooted at the provided directory, with the saved web pages inside it. You can open the root page and keep browsing deeper and deeper (until you hit the max depth you assigned).
'''
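# Quick orientation (illustrative sketch only; the URL and folder path below are placeholders,
# not values taken from this module):
#   download_wiki_page("https://en.wikipedia.org/wiki/Hyperlink", "./")   # saves one page plus its images
#   wiki_get_all("https://en.wikipedia.org/wiki/Hyperlink", max_depth=1, input_root_folderpath="./")   # breadth-first crawl
# Both helpers are defined further down in this file.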
class HTML_page_Obj: #uses urllib2
'''
class members (usable after constructor):
url : the page url
short_url : a shortened url (if possible to shorten)
html : the html of the webpage
charset : the charset used to encode the HTML
headers : a dictionary of the other miscellaneous headers which might or might not be present.
class functions:
__init__ : the constructor that sets all the above variables
all_hyperlinks : returns a dictionary of all hyperlinks in the html code
html_soup_and_prettify : prettifies the html by using BeautifulSoup.prettify()
get_response_object : internal method, not meant for external use
'''
def __init__(self, page_url):
response=None
self.url=None
try:
response=self.get_response_object(page_url)
except Exception:
try:
# time.sleep(1)
response=self.get_response_object(page_url)
except Exception:
try:
# time.sleep(5)
response=self.get_response_object(page_url)
except Exception:
print "\nTried 3 times. Cannot access url: %s. \nHence, cannot make HTML_page_Obj\n"%page_url
if response != None:
self.url=page_url
try:
self.short_url=response.headers.getparam('Link')
except Exception:
self.short_url=""
self.charset = response.headers.getparam('charset')
self.headers={'charset':self.charset}
for i in response.headers:
self.headers[i]=response.headers[i].split(';')[0]
self.html = response.read() #the actual html
self.html=text_manip.ensure_UTF8(self.html)
self.html_soup=None
self.link_dict=None
self.all_style=None
def get_response_object(self, page_url):
if page_url.lower().startswith('http://') or page_url.lower().startswith('https://'):
response=urllib2.urlopen(page_url)
else:
response=urllib2.urlopen('http://'+page_url)
return response
def all_hyperlinks(self):
article_soup=BeautifulSoup(self.html)
self.link_dict={}
for link_tag in article_soup.find_all('a'):
self.link_dict[link_tag.get_text().strip()]=link_tag.get('href')
# this makes a dict of the form {'link_text':'hyperlink'}; get_text() is used because an <a> tag's first child may itself be a tag (e.g. an <img>) rather than a string
return self.link_dict
def html_soup_and_prettify(self):
if self.html_soup==None:
self.html_soup=self.make_soup()
self.html=text_manip.ensure_UTF8(self.html_soup.prettify())
return self.html_soup
def make_soup(self):
if self.html_soup==None:
self.html_soup= BeautifulSoup(self.html)
return self.html_soup
def write_pretty_to_file(self, folderpath):
file_name = self.url.split('/')[-1]
self.html_soup_and_prettify()
with open(text_manip.make_file_path(folderpath, file_name, ".html"), "w") as output_html_file:
output_html_file.write(text_manip.ensure_ASCII(self.html))
def get_external_styling(self):
css_soup=BeautifulSoup(self.html)
stylesheet_links=[]
self.all_style=""
for head_link_tag in css_soup.find_all('link', rel="stylesheet"):
stylesheet_links.append(head_link_tag.get('href'))
#Note: order of links is important, do not compromise that
for link in stylesheet_links:
if link.startswith("//"): link="https:"+link # protocol-relative href
elif link.startswith("/"): link="https://"+text_manip.extract_website(self.url)+link # site-relative href
css_style_page=HTML_page_Obj(link)
self.all_style+=css_style_page.html +"\n"
return self.all_style
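# Minimal usage sketch for HTML_page_Obj (illustrative; the URL is a placeholder and network
# access is assumed to succeed):
#   page = HTML_page_Obj("https://en.wikipedia.org/wiki/Hyperlink")
#   links = page.all_hyperlinks()        # {'link text': 'href', ...}
#   page.write_pretty_to_file("./")      # prettified HTML written as <last url segment>.html
#   css = page.get_external_styling()    # concatenated text of every linked stylesheet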
class Wiki_page_obj:
def __init__(self, wikipedia_url, just_heading=False):
# inherit items from the HTML_page_Obj:
self._page = HTML_page_Obj(wikipedia_url)
self.url=self._page.url
self.wiki_domain = text_manip.extract_website(self.url) #gets en.wikipedia.org, etc. in different languages
# make the soup of the HTML and get the HTML, prettified:
soup=self._page.html_soup_and_prettify()
self.html=self._page.html #prettified, in UTF-8 format
# Get the heading of the Wikipedia page:
self.heading =text_manip.ensure_UTF8(BeautifulSoup('''%s'''%soup.find("h1", id="firstHeading", class_="firstHeading")).get_text().strip())
if just_heading==False: # prevents a lot of unnecessary processing if True
# Make a soup of the article html, set the article html:
article_soup=soup.find("div", id="mw-content-text") #might come in handy later
self.article_html=text_manip.ensure_UTF8(article_soup.prettify())
temp_art_html=self.article_html
temp_art_html=text_manip.remove_HTML_perfect(html=temp_art_html, tag="div", class_list=["reflist"])
temp_art_html=text_manip.remove_after(regex='<span.*?id="References"', text=temp_art_html)
temp_art_html=text_manip.remove_after(regex='<span.*?id="See_also"', text=temp_art_html)
temp_art_html=text_manip.remove_after(regex='<span.*?id="Further_reading"', text=temp_art_html)
temp_art_html=text_manip.remove_after(regex='<span.*?id="External_links"', text=temp_art_html)
article_soup=BeautifulSoup(temp_art_html)
# Get the links from the article HTML:
self.article_links=[] # real links, from article
self.img_links=[] # real direct links, from article
self.direct_img_to_img_map={}
self.media_links=[] # real links, from article
not_allowed_in_url_from_article=["/wiki/Special:", "/wiki/Template:", "/wiki/Portal:", "/wiki/Wikipedia:", "/wiki/Help:", "redlink=1"]
for link_tag in article_soup.find_all('a'):
link=link_tag.get('href')
if link is not None and link[:6]=="/wiki/": #only extracts links to wikipedia
allowed_flag=True
for allowed_test in not_allowed_in_url_from_article:
if allowed_test in link:
allowed_flag=False
break
if allowed_flag:
link=text_manip.ensure_UTF8(link)
mapped_link="https://"+self.wiki_domain+link
if link[:11] != "/wiki/File:":
self.article_links.append(mapped_link)
else :
if link[-4:]=='.ogg': #video or audio file
self.media_links.append(mapped_link)
else: # image file
img_tag=link_tag.img
# print "\n\n"
# print "link_tag :",link_tag
# print "\timg_tag :",img_tag
if img_tag is not None:
direct_img_link=img_tag.get('src')
# print "\t\tdirect_img_link :",direct_img_link
self.img_links.append(direct_img_link)
self.direct_img_to_img_map[direct_img_link]=mapped_link
self.article_links=list(set(self.article_links))
self.img_links=list(set(self.img_links))
self.media_links=list(set(self.media_links))
# replace the /wiki/ links with absolute links. Preserve the old html, and use replaced_html from now on
self.replaced_html=self.html
for link_tag in soup.find_all('a'):
link=link_tag.get('href')
if link is not None:
if link[:6]=="/wiki/":
mapped_link="https://"+self.wiki_domain+link
self.replaced_html=self.replaced_html.replace('href="%s'%link, 'href="%s'%mapped_link)
def get_external_styling(self):
return self._page.get_external_styling()
def write_to_file(self, folderpath, replaced=True):
file_path=text_manip.make_file_path(folderpath, filename=self.heading.strip(), extension=".html")
# print "self.html type: %s"%type(self.html)
with open(file_path,"w") as output_file:
if replaced:
output_file.write(text_manip.ensure_ASCII(self.replaced_html))
else:
output_file.write(text_manip.ensure_ASCII(self.html))
return file_path
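# Minimal usage sketch for Wiki_page_obj (illustrative; the URL and folder are placeholders):
#   wiki = Wiki_page_obj("https://en.wikipedia.org/wiki/Hyperlink")
#   print wiki.heading                   # page title
#   print len(wiki.article_links)        # deduplicated /wiki/ links, made absolute
#   saved = wiki.write_to_file("./")     # writes replaced_html (absolute links) to <heading>.html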
class Wikipedia_img_page_object:
# every image on a Wikipedia article links to its own file page
# those urls are of the form: https://<wiki domain>/wiki/File:<image name>.<extension>
def __init__(self, wikipedia_image_url=None, direct_url=None):
self.direct_image_link=None
self.saved_path=None
self.saved_filename=None
if direct_url is not None:
self.direct_image_link=direct_url
elif wikipedia_image_url is not None:
img_page=HTML_page_Obj(wikipedia_image_url)
img_soup=img_page.make_soup()
# print img_soup.find('div', _class="mw-filepage-resolutioninfo")
self.main_img_link=img_soup.find('div', class_="fullMedia").find('a', class_="internal").get('href')
if self.main_img_link[:9]=="//upload.":
self.main_img_link="https:"+self.main_img_link
img_links_soup=img_soup.find("div", class_="mw-filepage-resolutioninfo")
self.img_px_to_links_map={} #maps the product of height and width to tuples of pixel dimensions and their links
for img_link in img_links_soup.find_all('a'):
actual_link=img_link.get('href')
if actual_link[:9]=="//upload.":
actual_link="https:"+actual_link
img_link_text= text_manip.ensure_UTF8(re.sub(",","", img_link.get_text()) )
# print text_manip.ensure_ASCII("\n\nimg_link_text: %s"%img_link_text)
img_dimensions = re.findall('\d+', img_link_text)
if len(img_dimensions)==2: # check before indexing, so malformed link text cannot raise an IndexError
width=int(img_dimensions[0])
height=int(img_dimensions[1])
self.img_px_to_links_map[width*height]=( actual_link, width, height ) # maps pixel area to (link, width, height)
else: print "Cannot use image %s, which is at %s with dimensions %s"%(wikipedia_image_url, img_link.get('href'), img_dimensions)
def download_direct_image(self,folderpath, printing=False):
if self.direct_image_link is not None:
link = self.direct_image_link #don't modify self.direct_image_link
if link[:6] != "https:":
link="https:"+link
self.saved_path=text_manip.get_file( url=link, folderpath=folderpath, printing=False)
self.saved_filename=self.saved_path.split('/')[-1]
else:
self.download_main_image(folderpath, printing)
def download_main_image(self,folderpath, printing=False):
self.saved_path=text_manip.get_file( url=self.main_img_link, folderpath=folderpath, printing=False)
self.direct_image_link=self.main_img_link.split("https:")[-1]
self.saved_filename=self.saved_path.split('/')[-1]
if printing:
print "\n\n\nself.saved_filename:",self.saved_filename
def download_largest_image(self, folderpath, printing=False):
if self.img_px_to_links_map!={}:
max_index=max(self.img_px_to_links_map)
max_link=self.img_px_to_links_map[max_index][0]
self.saved_path=text_manip.get_file( url=max_link, folderpath=folderpath, printing=False) #max(dictionary) == max key in dictionary
self.direct_image_link=max_link.split("https:")[-1]
if printing:
print "self.direct_image_link :",self.direct_image_link
self.saved_filename=self.saved_path.split('/')[-1]
else:
if printing:
print "ERROR in download_largest_image: The resolution map is empty. Downloading main image instead"
self.download_main_image(folderpath)
def download_smallest_image(self, folderpath, printing=False):
if self.img_px_to_links_map!={}:
min_index=min(self.img_px_to_links_map)
min_link=self.img_px_to_links_map[min_index][0]
self.saved_path=text_manip.get_file( url=min_link, folderpath=folderpath, printing=False )
self.direct_image_link=min_link.split("https:")[-1]
if printing:
print "self.direct_image_link :",self.direct_image_link
self.saved_filename=self.saved_path.split('/')[-1]
else:
if printing:
print "ERROR in download_smallest_image: The resolution map is empty. Downloading main image instead"
self.download_main_image(folderpath)
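# Minimal usage sketch for Wikipedia_img_page_object (illustrative; the File: URL and folder
# are placeholders):
#   img = Wikipedia_img_page_object(wikipedia_image_url="https://en.wikipedia.org/wiki/File:Example.jpg")
#   img.download_largest_image("./imgs")   # picks the entry with the largest width*height
#   print img.saved_filename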
def download_wiki_page(wiki_link, parent_folder_path, printing=False):
# downloads a wikipedia page and its images (in a separate folder), and links the page to the images:
# make an object of the wiki link:
wiki_obj=Wiki_page_obj(wiki_link)
# make folder:
imgs_folder_name="imgs_%s"%(wiki_obj.heading)
imgs_folder_path=text_manip.make_folder_path(parent_folder_path=parent_folder_path, folder_name=imgs_folder_name)
text_manip.make_directory_if_not_exists(imgs_folder_path, printing=False)
# download all images and make replacements to HTML:
for img_link in wiki_obj.img_links:
if printing:
print ""
print "img_link:",img_link
# download image:
image=Wikipedia_img_page_object(direct_url=img_link)
image.download_direct_image(imgs_folder_path)
if image.saved_filename is not None:
# make modification/s to wiki_obj.replaced_html:
image_filepath=text_manip.make_file_path(folderpath="./"+imgs_folder_name, filename=image.saved_filename, extension="")
# change <a href="____"> to <a href=image_filepath>
if printing:
print "image_filepath:",image_filepath
# old=wiki_obj.replaced_html
wiki_obj.replaced_html = wiki_obj.replaced_html.replace('href="%s'%wiki_obj.direct_img_to_img_map[img_link], 'href="%s'%image_filepath)
# if old == wiki_obj.replaced_html:
# print "ERROR: NO replacements of <a> href"
# change <img src="____"> to <img src=image_filepath>
if printing:
print "image.direct_image_link:",image.direct_image_link
# old=wiki_obj.replaced_html
wiki_obj.replaced_html = wiki_obj.replaced_html.replace('src="%s'%image.direct_image_link, 'src="%s'%image_filepath)
# if old == wiki_obj.replaced_html:
# print "ERROR: NO replacements of <img> src"
# write replaced_html to file:
output_wiki_filepath = wiki_obj.write_to_file(parent_folder_path)
# return output filepath for later use
return (output_wiki_filepath, wiki_obj.article_links)
def link_to_style_file(style_file_path, input_html_file_path=None, input_raw_html=None, remove_external_stylesheets=False):
html=""
if input_raw_html is not None:
html=input_raw_html
elif input_html_file_path is not None:
with open(input_html_file_path, "r") as input_html_file:
html=input_html_file.read()
else:
print "ERROR: in link_to_style_file, input_html_file_path and input_raw_html cannot both be None"
exit(1)
if remove_external_stylesheets:
html=text_manip.regex_and_remove(regex='<link.*rel="stylesheet".*?/.*?>', text=html)
html=html.replace("</head>", '<link rel="stylesheet" href="%s"/></head>'%style_file_path)
if input_raw_html is not None:
return html
if input_html_file_path is not None:
with open(input_html_file_path, "w") as output_html_file:
output_html_file.write(html)
def replace_links(folderpath, downloaded_pages, style_file_relative_path):
html_file_names= text_manip.list_files_in_folder_with_extension(folderpath=folderpath, extension=".html")
html=""
for html_file_name in html_file_names:
filepath=text_manip.make_file_path(folderpath=folderpath, filename=html_file_name, extension="")
with open(filepath, "r") as html_file:
html=html_file.read()
for page_link in downloaded_pages:
if folderpath.split('/')[-1][:5]=="root_": # if we are dealing with the root html file
filesystem_link="./lvl%s/%s"%(downloaded_pages[page_link][0], downloaded_pages[page_link][1])
else:
if downloaded_pages[page_link][0] == 0: # root
filesystem_link="../%s"%(downloaded_pages[page_link][1])
else: filesystem_link="./lvl%s/%s"%(downloaded_pages[page_link][0], downloaded_pages[page_link][1])
# old = html
html=text_manip.HTML_attribute_content_replace(text=html, attr="href", current=page_link, replace_with=filesystem_link)
# if html !=old:
# print "REPLACED: %s----->%s"%(page_link, filesystem_link)
html=link_to_style_file(style_file_path=style_file_relative_path, input_raw_html=html, remove_external_stylesheets=True)
with open(filepath, "w") as html_file:
html_file.write(html)
def wiki_get_all(root_link, max_depth=1, input_root_folderpath="./", skip_already_downloaded=False, force_redo=False):
'''
Given a root link and input_root_folderpath, this function creates a folder input_root_folderpath/root_<root.heading>, inside which it places the root page, its associated images, and folders "lvl1", "lvl2", etc. (i.e. input_root_folderpath/root_<root.heading>/lvl1/, input_root_folderpath/root_<root.heading>/lvl2/, etc.), into which it downloads all the links of the closure, up to "lvl<max_depth>". The level folders are kept as siblings rather than nested inside one another, because that keeps the overall path lengths short.
This design also lets you copy or move the folder root_<root.heading> anywhere you like; it will still work as long as you don't rearrange the files and folders inside it. In short, given a root link about a topic, the application downloads all the associated articles and captures them in a neat, self-contained package that you can send to someone else so they can learn about the topic (this was the main incentive for building it).
Salient points to note while using the application:
The application is essentially a breadth-first search, because a depth-first search on the internet can lead you anywhere at all and is inadvisable, especially when trying to learn about something that first requires you to learn from links to related items, as on Wikipedia and other encyclopedic sites.
The downloaded files and folders form a self-contained ecosystem. The hyperlinks in the html are modified so that they point to one another in the filesystem. E.g. an html file in input_root_folderpath/root_<root.heading>/lvl1/ which originally had a link in its article such as "https://en.wikipedia.org/wiki/IBM" might have that link replaced by ../lvl2/IBM.html, if IBM.html was downloaded to /lvl2/.
This lets you start at any downloaded page and browse to any other downloaded page (though you'd probably want to start at the root, since that's why you ran this application).
Obviously, some links are not downloaded, because of max_depth. The non-downloaded links are those not in the closure of depth max_depth; by definition, these will be found, if at all, in the pages of the last level, i.e. /lvl<max_depth>. The application leaves ordinary hyperlinks for non-downloaded links, so you may connect to the internet and visit them.
Suppose you realize that your max_depth was not sufficient. No problem, you don't need to download all the links over again. You just set the skip_already_downloaded flag to True, and the function gets all the links from the last level, up to the new max_depth.
Note: because this flag exists to reduce data usage, it necessarily skips over all the /lvl<depth>/ folders without checking their actual contents (because tracing the links would require downloading all the pages' HTML over again). The assumption is that they were downloaded correctly, and the application starts getting fresh links from the files of /lvl<max_depth>/.
Styling is another issue. Since this application explores article links confined to Wikipedia, the styling is assumed to be the same for all the pages. Thus, only the styling of the root is downloaded, and all the files are made to link to that one style file. It is possible (and fairly easy) to download the styles for each file separately, but for now that is not implemented.
This in fact brings up the most important point of this application: swappability and generalization. It may have been crafted especially for Wikipedia, but with a little patience it can be generalized to any website, which makes it a powerful data and research tool.
This function works in two passes (a usage sketch follows at the end of this file):
Pass 1:
- start with the root and make a separate folder for it inside input_root_folderpath, as input_root_folderpath/root_<root.heading>
- create a folder inside this as input_root_folderpath/root_<root.heading>/imgs_<root.heading>
- download the root's associated images to input_root_folderpath/root_<root.heading>/imgs_<root.heading>, modify the root's html to link to these images, then write the root's modified html to file (in input_root_folderpath/root_<root.heading>)
- add the root's filename and level to a dict, where
- the key is the root's real url
- the value is a tuple: (level, file_name)
- create a level-wise dict of all article_links of that level. For the root, that is just the root's article_links.
- Do the following for every level (the root's level is 0):
- Create a folder called "lvl"+str(current_level+1) in input_root_folderpath/root_<root.heading>
- for each link in this level:
- check the dictionary: if dict[link] is empty, i.e. (level, filename) does not exist, the page has not been downloaded yet; if it does exist, we already have the file, so skip to the next child article_link.
- If the link has not been downloaded, download the link & its images to input_root_folderpath/root_<root.heading>/lvl<current_level+1>/ (in the same way as the root).
- set dict[link]=(level, filename) in the dictionary of downloaded links, so that it is not downloaded twice in the future.
- append this link's child article_links to the level-order child article_links dict (making sure there are no repeats)
Pass 2:
- link all the files to one another.
We keep these as two separate passes to reduce redundant data usage (by not downloading links that are already there) at the (minimal) cost of time.
'''
root_wiki_obj=Wiki_page_obj(wikipedia_url=root_link, just_heading=True) #gets only the file and its heading, no further processing
# make root_<root.heading> folder inside input_root_folderpath:
root_folder_name="root_%s"%(root_wiki_obj.heading)
root_folder_path=text_manip.make_folder_path(parent_folder_path=input_root_folderpath, folder_name=root_folder_name)
text_manip.make_directory_if_not_exists(root_folder_path, printing=False)
# download style file:
full_style_file_path=text_manip.make_file_path(root_folder_path, "wiki_style",".css")
with open(full_style_file_path, "w") as style_file:
style_file.write(text_manip.ensure_ASCII(root_wiki_obj.get_external_styling() ))
# download the root and all of its associated images:
root_download_tuple=download_wiki_page(wiki_link=root_link, parent_folder_path=root_folder_path)
root_full_html_filepath= root_download_tuple[0]
# link to style file:
relative_root_style_path="./"+full_style_file_path.split('/')[-1]
# print "relative_root_style_path:",relative_root_style_path
link_to_style_file(style_file_path=relative_root_style_path, input_html_file_path=root_full_html_filepath)
relative_style_path="../"+full_style_file_path.split('/')[-1] # to be used for all other article files
# make a dict of already downloaded pages. Use the relative paths
downloaded_pages=defaultdict(lambda:None)
downloaded_pages[root_link]=(0,root_full_html_filepath.split('/')[-1])
# make a dict of pages we must now download
associated_links=defaultdict(lambda:[])
associated_links[0]=root_download_tuple[1]
print "Level 0 (root level) :\n"
print "\tNumber of pages obtained at this level= "+str(len(downloaded_pages))
print "\tNumber of links to next level= "+str(len(associated_links[0]))
# for link in associated_links[0]:
# print link
for current_level in range(0,max_depth):
print "\n\nLevel %s :\n"%(current_level+1)
download_count=0
existing_downloaded_count=len(downloaded_pages)
child_folder_name="lvl%s"%(current_level+1)
child_folder_path=text_manip.make_folder_path(parent_folder_path=root_folder_path, folder_name=child_folder_name)
if force_redo or text_manip.make_directory_if_not_exists(child_folder_path) :
# i.e. if the directory does not already exist, do the following; if it does, we skip it
for associated_link in associated_links[current_level]:
if downloaded_pages[associated_link] is None:
download_tuple = download_wiki_page(wiki_link=associated_link, parent_folder_path=child_folder_path)
# print "\rDone Downloading",
if current_level+1 != max_depth:
associated_links[current_level+1].extend(download_tuple[1]) # extend, not append: download_tuple[1] is a list of links
downloaded_pages[associated_link]=( current_level+1, download_tuple[0].split('/')[-1] )
download_count+=1
print "\r\tNumber of pages downloaded so far = %s"%download_count,
if current_level+1 != max_depth:
associated_links[current_level+1]=list(set(associated_links[current_level+1]))
new_download_count=len(downloaded_pages)
print "\n\tNumber of pages obtained at this level= "+str(new_download_count-existing_downloaded_count)
print "\tNumber of links to next level= "+str(len(associated_links[current_level+1]))
# print "\n\n\nDOWNLOADED PAGES:"
# for page in downloaded_pages:
# print str(page),"--->",str(downloaded_pages[page])
# replace all links with links to the filesystem:
# replace root links:
style_file_relative_path="./wiki_style.css"
replace_links(folderpath=root_folder_path, downloaded_pages=downloaded_pages, style_file_relative_path=style_file_relative_path)
# replace rest of links
style_file_relative_path="../wiki_style.css"
for current_level in range(1,max_depth+1):
lvl_folder=text_manip.make_folder_path(parent_folder_path=root_folder_path, folder_name="lvl%s"%(current_level))
replace_links(folderpath=lvl_folder, downloaded_pages=downloaded_pages, style_file_relative_path=style_file_relative_path)
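# ---------------------------------------------------------------------------
# Usage sketch (illustrative): running this file directly performs a small crawl.
# The root URL, depth and output folder below are placeholders; adjust them as needed.
# Assumes the text_manip helper module used throughout this file is importable.
if __name__ == "__main__":
wiki_get_all(root_link="https://en.wikipedia.org/wiki/Hyperlink", max_depth=1, input_root_folderpath="./")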