コード例 #1
0
ファイル: webcrawler.py プロジェクト: riwazp7/search_engine
# A make-shift webcrawler: need to get out of loops in webpages
# (c) 2016 RIWAZ POUDYAL, PREKSHA KOIRALA

import link_parse #Custom library. Can be found inside my_library or inside current folder
import queue

# Running total of links discovered so far
count: int = 0

# Links still waiting to be crawled, handed out in first-in-first-out order
q: queue.Queue = queue.Queue()

def askSite():
    """Prompt the user for a site and return the text they typed.

    Called for the first site and again whenever the queue runs out of
    sites to analyze.
    """
    site = input('Enter a site: ')
    return site


# Every link that has ever been put on the queue. Checking against it keeps
# pages that link to each other from being re-queued and re-crawled forever.
# Assumes links are hashable (e.g. plain strings) -- TODO confirm against
# what link_parse.parseLink returns.
seen = set()

while True:
    # Ask the user for a site whenever there is nothing left to crawl.
    if q.empty():
        start = askSite()
        seen.add(start)
        q.put(start)

    # Get the list of links found on the next queued page.
    links = link_parse.parseLink(q.get())

    # Print and enqueue each link, but only the first time it is discovered.
    for site in links:
        if site in seen:
            continue
        seen.add(site)
        count += 1
        print(count, site)
        q.put(site)


コード例 #2
0
ファイル: engine.py プロジェクト: riwazp7/search_engine
"""
Main class for the search engine, I guess. Don't yet know what goes here.
(c) 2016 Riwaz Poudyal
"""

import link_parse
import word_freq
import text_parse
import queue
import webpage

# Links waiting to be processed, in first-in-first-out order
link_queue: queue.Queue = queue.Queue()

# One webpage object per page processed so far.
# TODO: make sure a page is never added twice, to avoid crawl loops.
# TODO: persist results somewhere; everything cannot be held in memory.
wpage_list: list = []

def askLink():
    """Prompt the user for a link and return the text they entered."""
    entered = input("enter a unique link: ")
    return entered

# Links that have already been queued, so a page is processed at most once
# even when pages link to each other in a cycle.
# Assumes links are hashable (e.g. plain strings) -- TODO confirm against
# what link_parse.parseLink returns.
queued_links = set()

while True:

    # queue.Queue does not support len(); empty() is the supported way to
    # check whether there is anything left to process.
    if link_queue.empty():
        first_link = askLink()
        queued_links.add(first_link)
        link_queue.put(first_link)

    # Take the next page and collect its word frequencies and outgoing links.
    pagelink = link_queue.get()
    freq_list = word_freq.findFreq(pagelink)
    link_list = link_parse.parseLink(pagelink)

    # Queue each outgoing link only the first time it is seen.
    for link in link_list:
        if link not in queued_links:
            queued_links.add(link)
            link_queue.put(link)

    # NOTE(review): `webpage` is bound by `import webpage`, i.e. it is a
    # module, and modules are not callable. This presumably needs to be the
    # page class defined inside that module -- confirm its name and fix.
    wpage_list.append(webpage(pagelink, freq_list, link_list))