    def write_to_file(self, folderpath, replaced=True):
        # Write either the link-replaced html or the original html to
        # <folderpath>/<heading>.html and return the resulting file path.
        file_path = text_manip.make_file_path(
            folderpath, filename=self.heading.strip(), extension=".html")
        with open(file_path, "w") as output_file:
            if replaced:
                output_file.write(text_manip.ensure_ASCII(self.replaced_html))
            else:
                output_file.write(text_manip.ensure_ASCII(self.html))
        return file_path
    def write_pretty_to_file(self, folderpath):
        # Prettify the html and write it to <folderpath>/<last url segment>.html.
        file_name = self.url.split('/')[-1]
        self.html_soup_and_prettify()
        with open(text_manip.make_file_path(folderpath, file_name, ".html"),
                  "w") as output_html_file:
            output_html_file.write(text_manip.ensure_ASCII(self.html))
def wiki_get_all(root_link, max_depth=1, input_root_folderpath="./",
                 skip_already_downloaded=False, force_redo=False):
    '''
    Given a root link and the input_root_folderpath, this function creates a folder,
    input_root_folderpath/root_<root.heading>, containing the root page, its associated
    images, and the folders "lvl1", "lvl2", etc. (i.e.
    input_root_folderpath/root_<root.heading>/lvl1/,
    input_root_folderpath/root_<root.heading>/lvl2/, and so on), into which all the links
    of the closure are downloaded, up to "lvl<max_depth>". The lvl folders sit side by
    side rather than nested inside one another, which keeps the overall path lengths
    shorter. This design also lets you copy or move the folder root_<root.heading>
    anywhere you desire; it will still work as long as you don't move around the files
    and folders inside it.

    So, basically, given a root link about a certain topic, this application downloads
    all the articles associated with that topic. It captures them in a neat little
    package that you can then send to someone else, so that they can learn about the
    topic (this was the main incentive for building the application).

    Salient points to be noted while using the application:

    - The application is essentially a breadth-first search, because a depth-first
      search on the INTERNET can lead you anywhere at all and is inadvisable, especially
      when trying to learn about something which first requires you to learn from links
      to other items, as on Wikipedia and other encyclopedic sites.

    - The files and folders that are downloaded form a self-contained ecosystem. The
      hyperlinks in the html have been modified so that they now point to each other in
      the filesystem. E.g. an html file in input_root_folderpath/root_<root.heading>/lvl1/
      whose article originally contained the link "https://en.wikipedia.org/wiki/IBM"
      might have that link replaced by ../lvl2/IBM.html, if we downloaded IBM.html to
      /lvl2/. This allows us to start at any downloaded page and browse to any other
      downloaded page (though you'd probably want to start at the root, since that's why
      you ran this application).

    - Obviously, some links are not downloaded, because of max_depth. The non-downloaded
      links are those not in the closure of depth max_depth; by definition, these will
      be found, if at all, in the pages of the last level, i.e. /lvl<max_depth>. The
      application keeps ordinary hyperlinks for non-downloaded links, so you may connect
      to the internet and visit them.

    - Suppose you realize that your max_depth specification was not sufficient. No
      problem, you don't need to download all the links over again. You just set the
      skip_already_downloaded flag to True, and the function gets all the links from the
      last level, up to the new max_depth. Note: because this flag is there to reduce
      data usage, it will necessarily skip over all the /lvl<depth>/ folders without
      checking their actual contents (tracing the links would require downloading all
      the pages' HTML all over again). The assumption is that they have been downloaded
      correctly, and the application will start getting fresh links from the files of
      /lvl<max_depth>/.

    - Styling is another issue. Since this application explores article links which are
      confined to Wikipedia, the styling is assumed to be the same for all the links.
      Thus, only the styling of the root is downloaded, and all the files are made to
      link to that style file.

    It is possible (and fairly easy) to implement a feature that downloads the styles
    for each file separately, but for now it is not implemented. This in fact brings me
    to the most important point of this application: swappability and generalization.
    This application may have been crafted especially for Wikipedia, but if you have the
    patience, it can easily be generalized to any website. That makes it a very powerful
    data and research tool.

    This function works in two passes:

    Pass 1:
    - Start with the root; make a separate folder for it inside the
      input_root_folderpath, as input_root_folderpath/root_<root.heading>.
    - Create a folder inside this as input_root_folderpath/root_<root.heading>/img_<root.heading>.
    - Download the root's associated images to
      input_root_folderpath/root_<root.heading>/img_<root.heading>, modify the root's
      html to link to these images, then write the root's modified html to file (in
      input_root_folderpath/root_<root.heading>).
    - Append the root's filename and level to a dict, where
        - the key is the root's real url
        - the value is a tuple: (level, file_name)
    - Create a level-wise dict of all article_links of each level. For the root, that is
      just the root's article_links.
    - Do the following for every level (the root's level is 0):
        - Create a folder called "lvl"+str(current_level+1) in
          input_root_folderpath/root_<root.heading>.
        - For each link in this level:
            - Check the dictionary: if dict[link] is empty, i.e. (level, filename) does
              not exist, we don't have the file yet. If it exists, we already have the
              file, so skip to the next child article_link.
            - If the link does not exist, download the link and its images to
              input_root_folderpath/root_<root.heading>/lvl<current_level+1>/ (the same
              way we did the root).
            - Set dict[link] = (level, filename), i.e. add it to the dictionary of
              downloaded links, so that it is not downloaded twice in the future.
            - Append this link's child article_links to the level-order child
              article_links dict (making sure there are no repeats).

    Pass 2:
    - Link all the files to one another (a minimal sketch of the per-link path
      computation appears after this function).

    We keep these as two separate passes to reduce redundant data usage (by not
    downloading links that are already there) at the (minimal) cost of time.
    '''
    root_wiki_obj = Wiki_page_obj(
        wikipedia_url=root_link,
        just_heading=True)  # gets only the file and its heading, no further processing

    # make root_<root.heading> folder inside input_root_folderpath:
    root_folder_name = "root_%s" % (root_wiki_obj.heading)
    root_folder_path = text_manip.make_folder_path(
        parent_folder_path=input_root_folderpath, folder_name=root_folder_name)
    text_manip.make_directory_if_not_exists(root_folder_path, printing=False)

    # download style file:
    full_style_file_path = text_manip.make_file_path(root_folder_path, "wiki_style", ".css")
    with open(full_style_file_path, "w") as style_file:
        style_file.write(
            text_manip.ensure_ASCII(root_wiki_obj.get_external_styling()))

    # download the root and all of its associated images:
    root_download_tuple = download_wiki_page(
        wiki_link=root_link, parent_folder_path=root_folder_path)
    root_full_html_filepath = root_download_tuple[0]

    # link the root to the style file:
    relative_root_style_path = "./" + full_style_file_path.split('/')[-1]
    link_to_style_file(
        style_file_path=relative_root_style_path,
        input_html_file_path=root_full_html_filepath)
    relative_style_path = "../" + full_style_file_path.split('/')[-1]  # to be used for all other article files
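
    # Bookkeeping for the crawl. A hypothetical example of the formats used below
    # (the actual file names depend on each page's heading):
    #   downloaded_pages["https://en.wikipedia.org/wiki/IBM"] -> (2, "IBM.html")
    #       i.e. url -> (level, file_name), where level 2 means the file lives in lvl2/
    #       (level 0 is the root page, which lives directly in root_<root.heading>/).
    #   associated_links[2] -> the list of article urls found on the pages of level 2,
    #       which are the candidates to be downloaded into lvl3/.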
    # make a dict of already downloaded pages; use the relative paths:
    downloaded_pages = defaultdict(lambda: None)
    downloaded_pages[root_link] = (0, root_full_html_filepath.split('/')[-1])
    # make a dict of pages we must now download:
    associated_links = defaultdict(lambda: [])
    associated_links[0] = root_download_tuple[1]

    print "Level 0 (root level) :\n"
    print "\tNumber of pages obtained at this level= " + str(len(downloaded_pages))
    print "\tNumber of links to next level= " + str(len(associated_links[0]))

    for current_level in range(0, max_depth):
        print "\n\nLevel %s :\n" % (current_level + 1)
        download_count = 0
        existing_downloaded_count = len(downloaded_pages)
        child_folder_name = "lvl%s" % (current_level + 1)
        child_folder_path = text_manip.make_folder_path(
            parent_folder_path=root_folder_path, folder_name=child_folder_name)
        # i.e. if the directory does not already exist (or force_redo is set), do the
        # following; if it does already exist, we skip this level's downloads:
        if force_redo or text_manip.make_directory_if_not_exists(child_folder_path):
            for associated_link in associated_links[current_level]:
                if downloaded_pages[associated_link] is None:
                    download_tuple = download_wiki_page(
                        wiki_link=associated_link,
                        parent_folder_path=child_folder_path)
                    if current_level + 1 != max_depth:
                        associated_links[current_level + 1].extend(download_tuple[1])
                    downloaded_pages[associated_link] = (
                        current_level + 1, download_tuple[0].split('/')[-1])
                    download_count += 1
                    print "\r\tNumber of pages downloaded so far = %s" % download_count,
        if current_level + 1 != max_depth:
            # deduplicate the links gathered for the next level:
            associated_links[current_level + 1] = list(
                set(associated_links[current_level + 1]))
        new_download_count = len(downloaded_pages)
        print "\n\tNumber of pages obtained at this level= " + str(
            new_download_count - existing_downloaded_count)
        print "\tNumber of links to next level= " + str(
            len(associated_links[current_level + 1]))

    # replace all links with links to the filesystem.
    # replace the root's links:
    style_file_relative_path = "./wiki_style.css"
    replace_links(
        folderpath=root_folder_path,
        downloaded_pages=downloaded_pages,
        style_file_relative_path=style_file_relative_path)
    # replace the rest of the links:
    style_file_relative_path = "../wiki_style.css"
    for current_level in range(1, max_depth + 1):
        lvl_folder = text_manip.make_folder_path(
            parent_folder_path=root_folder_path,
            folder_name="lvl%s" % (current_level))
        replace_links(
            folderpath=lvl_folder,
            downloaded_pages=downloaded_pages,
            style_file_relative_path=style_file_relative_path)
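

# Illustration only: a minimal sketch of the per-link path computation that Pass 2
# performs. The real rewriting is done by replace_links() (defined elsewhere in this
# module); the helper below is hypothetical, not part of the module's API, and only
# assumes the (level, file_name) format that downloaded_pages uses above.
def _relative_href_sketch(source_level, target_url, downloaded_pages):
    """Return the relative filesystem href for target_url, or None if that page was
    never downloaded (in which case the original web hyperlink is kept)."""
    entry = downloaded_pages.get(target_url)
    if entry is None:
        return None  # outside the closure of depth max_depth: keep the web link
    target_level, target_file = entry
    # the root page sits directly in root_<heading>/; every other level in lvl<N>/:
    target_dir = "" if target_level == 0 else "lvl%s/" % target_level
    if source_level == 0:
        return "./" + target_dir + target_file  # the root links down into lvl<N>/
    return "../" + target_dir + target_file  # lvl<K> pages first go up one level

# e.g. for a page sitting in lvl1/ that links to a page saved as lvl2/IBM.html:
#   _relative_href_sketch(1, "https://en.wikipedia.org/wiki/IBM", downloaded_pages)
# returns "../lvl2/IBM.html", matching the example in wiki_get_all's docstring.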
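

# A hedged usage example (the URL and depth are illustrative; the folder name assumes
# the page heading is "IBM"):
if __name__ == "__main__":
    # Downloads the IBM article and every article reachable within two clicks into
    # ./root_IBM/ (with wiki_style.css, its images, lvl1/ and lvl2/), then rewrites
    # the hyperlinks between downloaded pages as relative filesystem paths.
    wiki_get_all(
        root_link="https://en.wikipedia.org/wiki/IBM",
        max_depth=2,
        input_root_folderpath="./")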