# A make-shift webcrawler: need to get out of loops in webpages
# (c) 2016 RIWAZ POUDYAL, PREKSHA KOIRALA

import link_parse  # Custom library. Can be found inside my_library or inside the current folder
import queue

# Queue of links we still have to crawl
q = queue.Queue()

# Number of sites looked at so far
count = 0

# Links already seen, so we don't revisit pages and get stuck in loops
visited = set()

# Ask the user for a site at first, or whenever our queue runs out of sites to analyze
def askSite():
    return input('Enter a site: ')

while True:
    if q.empty():
        q.put(askSite())  # Ask for a site if we ran out

    # Get a list of links in the webpage
    links = link_parse.parseLink(q.get())

    # Print each new link and enqueue it for analysis
    for site in links:
        if site in visited:
            continue  # Skip links we have already crawled, to avoid loops
        visited.add(site)
        count += 1
        print(count, site)
        q.put(site)
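# ---------------------------------------------------------------------------
# The custom link_parse module is referenced above but not shown. Below is a
# hypothetical sketch of what its parseLink(url) function could look like,
# assuming it fetches a page and returns the absolute URLs of its <a> tags.
# It uses only the standard library and is an illustration, not the authors'
# actual implementation.
# ---------------------------------------------------------------------------
from html.parser import HTMLParser
from urllib.request import urlopen
from urllib.parse import urljoin

class _AnchorCollector(HTMLParser):
    """Collects href attributes from <a> tags, resolved against a base URL."""
    def __init__(self, base_url):
        super().__init__()
        self.base_url = base_url
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    # Resolve relative links against the page's own URL
                    self.links.append(urljoin(self.base_url, value))

def parseLink(url):
    """Return a list of absolute links found on the page at url."""
    with urlopen(url) as response:
        html = response.read().decode('utf-8', errors='replace')
    collector = _AnchorCollector(url)
    collector.feed(html)
    return collector.links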
'''
Main class for the search engine, I guess. Don't yet know what goes here haha.

(c) 2016 Riwaz Poudyal
'''

import link_parse
import word_freq
import text_parse
import queue
import webpage

# List of Webpage objects, one for each webpage we have looked at so far.
# Have to make sure new objects we create aren't already in the list, to avoid loops.
# Also need a way to write the data to disk; we cannot hold everything in memory.
wpage_list = []

link_queue = queue.Queue()

def askLink():
    return input("enter a unique link: ")

while True:
    if link_queue.empty():  # queue.Queue has no len(); use empty() instead
        link_queue.put(askLink())

    pagelink = link_queue.get()
    freq_list = word_freq.findFreq(pagelink)
    link_list = link_parse.parseLink(pagelink)

    for link in link_list:
        link_queue.put(link)

    # webpage is a module, so instantiate its Webpage class
    wpage_list.append(webpage.Webpage(pagelink, freq_list, link_list))
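# ---------------------------------------------------------------------------
# The webpage and word_freq modules are also custom and not shown. The
# sketches below are hypothetical stand-ins, assuming Webpage is a plain
# record of a crawled page and findFreq(url) returns a word -> count table
# for the page's text. They show one way the interfaces used above could be
# satisfied, not the authors' actual code.
# ---------------------------------------------------------------------------
import re
from collections import Counter
from urllib.request import urlopen

class Webpage:
    """Record of one crawled page: its URL, word frequencies, and out-links."""
    def __init__(self, url, freq_list, link_list):
        self.url = url              # address of the crawled page
        self.freq_list = freq_list  # word -> count mapping from findFreq
        self.link_list = link_list  # outgoing links from parseLink

    def __eq__(self, other):
        # Pages compare equal by URL, which supports the duplicate check
        # planned in the comments above (avoiding loops in wpage_list)
        return isinstance(other, Webpage) and self.url == other.url

    def __hash__(self):
        return hash(self.url)

def findFreq(url):
    """Fetch the page at url and return a Counter of its words."""
    with urlopen(url) as response:
        html = response.read().decode('utf-8', errors='replace')
    # Crude tokenization: drop tags, lowercase, and split on non-letters
    text = re.sub(r'<[^>]+>', ' ', html)
    words = re.findall(r'[a-z]+', text.lower())
    return Counter(words)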