def genRunner(url):
    '''Fetch the page for url and return its post-processed content.

    The page is obtained with makePage and parsed with makeSoup using the
    'html5lib' parser name.  The parsed document is handed to
    getAllContent and the result is passed through postprocContent.

    Returns the value produced by postprocContent, or None when makePage
    gives back a falsy page or the parsed document has no body or no head.
    '''
    page = makePage(url)
    if not page:
        return None

    soup = makeSoup(page, url, 'html5lib')
    if not (soup.body and soup.head):
        # Both sections are required before any content is extracted.
        print('no body or no head found...')
        return None

    return postprocContent(getAllContent(soup))
def genRunner(url):
    '''Fetch the page for url and return its metadata and processed content.

    The page is obtained with makePage and parsed with makeSoup using the
    'html5lib' parser name.  getAllContent is expected to return a
    (meta_info, CONTENT) pair; when both are truthy, CONTENT is passed
    through postprocContent.

    Returns a (meta_info, CONTENT) pair on success.  Every failure path
    (falsy page, missing body or head, falsy meta_info or CONTENT) returns
    (None, None), so callers can always unpack the result into two names.
    '''
    # NOTE (original author): why am I not using soup stuff??
    failure = (None, None)

    page = makePage(url)
    if not page:
        return failure

    soup = makeSoup(page, url, 'html5lib')
    if not soup.body or not soup.head:
        print('no body or no head found...')
        return failure

    meta_info, CONTENT = getAllContent(soup)
    if not (meta_info and CONTENT):
        return failure

    return meta_info, postprocContent(CONTENT)
import os
import sys

import prepwork
import alltheMeta


def mergeOthDict(mod, o):
    '''Merge the new "other" dict, o, into the running "other" dict, mod.

    For a key present in both dicts the stored value becomes
    mod[key].union(o[key]); a key only present in o is copied over as-is.
    mod is updated in place and also returned.
    '''
    for key, values in o.items():
        if key in mod:
            # union() builds a new object, so the value held by o is untouched.
            mod[key] = mod[key].union(values)
        else:
            mod[key] = values
    return mod


if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.exit('usage: %s URL_FILE' % sys.argv[0])

    url_file = sys.argv[1]
    if not os.path.isfile(url_file):
        sys.exit('not a file: %s' % url_file)

    # One URL per line; blank lines and surrounding whitespace are dropped.
    with open(url_file, 'r') as f1:
        urls = [line.strip() for line in f1 if line.strip()]

    mod = dict()
    for url in urls:
        page = prepwork.makePage(url)
        if not page:
            # Presumably makePage returns a falsy value when the fetch
            # fails -- TODO confirm against prepwork.  Skip the URL rather
            # than hand an empty page to makeSoup.
            continue
        soup = prepwork.makeSoup(page, url)
        _, o = alltheMeta.getAllTheMeta(soup)
        mod = mergeOthDict(mod, o)

    for key in mod:
        print(key)
        print(mod[key])
        print('\n')