def test_engine():
    index, graph = crawl_web('http://www.udacity.com/cs101x/index.html')
    ranks = compute_ranks(graph)
    #print index
    print "_+_+_+_++_+_++_+_+_+_+_++_+_+_++"
    print lucky_search(index, ranks, 'walking')
    #>>> https://www.udacity.com/cs101x/index.html
    print lucky_search(index, ranks, 'kicking')
    #>>> https://www.udacity.com/cs101x/crawling.html
    print lucky_search(index, ranks, 'Ossifrage')
    #>>> https://www.udacity.com/cs101x/flying.html
    print lucky_search(index, ranks, 'ossifrage')
    #>>> None
    print "_+_+_+_++_+_++_+_+_+_+_++_+_+_++"
    print ordered_search(index, ranks, 'to')
    #>>> https://www.udacity.com/cs101x/index.html
    print ordered_search(index, ranks, 'Ossifrage')
    #>>> https://www.udacity.com/cs101x/flying.html
    print ordered_search(index, ranks, 'crawl')
    #>>> index crawling
    print ordered_search(index, ranks, 'ossifrage')
import datetime  # needed for the timing below


def test_engine():
    print "Testing..."
    start = datetime.datetime.now()
    result = crawl_web('040914')
    print "Finished tests."
    delta = datetime.datetime.now() - start
    print delta / len(result)  # average time per crawled page
def post(self):
    corpus, graph = crawl_web('http://udacity.com/cs101x/urank/index.html')
    ranks = compute_ranks(graph)
    query = self.request.get('text')
    result = lucky_search(corpus, ranks, query)
    if not result:
        self.render(text="", links="try www.google.com")
    else:
        self.render(text=query, links=result)
import pickle


def main(keyword):
    name = 'youfollowthefilm'
    fname = name + '.pkl'
    domain = 'http://www.' + name + '.com'
    try:
        # load a previously pickled crawl if one exists
        with open(fname, 'r') as fin:
            website = pickle.load(fin)
    except:
        # fall back to a fresh crawl on any load failure
        website = crawl_web(domain)
    return search(website, keyword), website._titles
def post(self):
    key = self.request.get('key')
    url = Url.get_by_key_name(key)
    if url is None:
        page, links = crawler.crawl_web(key)

        def txn():
            url = Url.get_by_key_name(key)
            if url is None:
                url = Url(key_name=key, uri=key)
                for l in links:
                    taskqueue.add(queue_name='url-crawler-queue',
                                  url='/queue', params={'key': l})
                url.put()

        db.run_in_transaction(txn)
def test():
    print "Testing..."
    from studentMain import LuckySearch
    index, graph = crawl_web('http://udacity.com/cs101x/urank/index.html')
    ranks = compute_ranks(graph)
    kathleen = 'http://udacity.com/cs101x/urank/kathleen.html'
    nickel = 'http://udacity.com/cs101x/urank/nickel.html'
    arsenic = 'http://udacity.com/cs101x/urank/arsenic.html'
    hummus = 'http://udacity.com/cs101x/urank/hummus.html'
    indexurl = 'http://udacity.com/cs101x/urank/index.html'
    lucky = LuckySearch()
    print lucky.GET('Hummus') == kathleen
    print lucky.GET('the') == nickel
    print lucky.GET('babaganoush') == "Try searchwithpeter.info."
    print "Finished tests."
def test_suite():
    print "Testing...\n"
    from studentMain import LuckySearch
    index, graph = crawl_web('http://udacity.com/cs101x/urank/index.html')
    ranks = compute_ranks(graph)
    kathleen = 'http://udacity.com/cs101x/urank/kathleen.html'
    nickel = 'http://udacity.com/cs101x/urank/nickel.html'
    arsenic = 'http://udacity.com/cs101x/urank/arsenic.html'
    hummus = 'http://udacity.com/cs101x/urank/hummus.html'
    indexurl = 'http://udacity.com/cs101x/urank/index.html'
    lucky = LuckySearch()
    print lucky.GET('Hummus') == kathleen
    print lucky.GET('the') == nickel
    print lucky.GET('babaganoush') == "Try searchwithpeter.info."
    print "\nFinished tests."
import pickle


def main():
    name = 'youfollowthefilm'
    fname = name + '.pkl'
    domain = 'http://www.' + name + '.com'
    try:
        with open(fname, 'r') as fin:
            website = pickle.load(fin)
        print "Successfully read my_site from " + fname
    except:
        # no usable cache, so crawl fresh and pickle the result
        website = crawl_web(domain)
        try:
            with open(fname, 'w') as fout:
                pickle.dump(website, fout)
            print "Successfully wrote my_site to " + fname
        except IOError, e:
            print "Cannot write out my_site: " + str(e)
def test_engine():
    print "Testing..."
    kathleen = 'http://udacity.com/cs101x/urank/kathleen.html'
    nickel = 'http://udacity.com/cs101x/urank/nickel.html'
    arsenic = 'http://udacity.com/cs101x/urank/arsenic.html'
    hummus = 'http://udacity.com/cs101x/urank/hummus.html'
    indexurl = 'http://udacity.com/cs101x/urank/index.html'
    corpus = crawl_web('http://udacity.com/cs101x/urank/index.html')
    assert lucky_search(corpus, 'Hummus') == kathleen
    assert ordered_search(corpus, 'Hummus') == [kathleen, nickel, arsenic, hummus, indexurl]
    assert lucky_search(corpus, 'the') == nickel
    assert ordered_search(corpus, 'the') == [nickel, arsenic, hummus, indexurl]
    assert lucky_search(corpus, 'babaganoush') == None
    assert ordered_search(corpus, 'babaganoush') == None
    print "Finished tests."
import pickle


def test_engine():
    print "Testing..."
    kathleen = 'http://udacity.com/cs101x/urank/kathleen.html'
    nickel = 'http://udacity.com/cs101x/urank/nickel.html'
    arsenic = 'http://udacity.com/cs101x/urank/arsenic.html'
    hummus = 'http://udacity.com/cs101x/urank/hummus.html'
    indexurl = 'http://udacity.com/cs101x/urank/index.html'
    corpus = crawl_web('http://udacity.com/cs101x/urank/index.html')
    fname = 'corpus.pkl'
    try:
        with open(fname, 'w') as fout:
            pickle.dump(corpus, fout)
        print "Pickled file to " + fname
    except IOError, e:
        print "Failed to write to file: " + str(e)
import pickle


def test_engine():
    print "Testing..."
    kathleen = 'http://udacity.com/cs101x/urank/kathleen.html'
    nickel = 'http://udacity.com/cs101x/urank/nickel.html'
    arsenic = 'http://udacity.com/cs101x/urank/arsenic.html'
    hummus = 'http://udacity.com/cs101x/urank/hummus.html'
    indexurl = 'http://udacity.com/cs101x/urank/index.html'
    corpus = crawl_web('http://udacity.com/cs101x/urank/index.html')
    fname = 'corpus.pkl'
    # YOUR CODE HERE
    try:
        with open(fname, 'w') as fout:
            pickle.dump(corpus, fout)
        print "Successfully wrote to " + fname
    except IOError, e:
        print "Most odacious! Cannot write to corpus " + str(e)
def test_engine():
    print "Testing..."
    kathleen = "http://udacity.com/cs101x/urank/kathleen.html"
    nickel = "http://udacity.com/cs101x/urank/nickel.html"
    arsenic = "http://udacity.com/cs101x/urank/arsenic.html"
    hummus = "http://udacity.com/cs101x/urank/hummus.html"
    indexurl = "http://udacity.com/cs101x/urank/index.html"
    corpus = crawl_web("http://udacity.com/cs101x/urank/index.html")
    fname = "corpus.pkl"
    # YOUR CODE HERE
    assert lucky_search(corpus, "Hummus") == kathleen
    assert ordered_search(corpus, "Hummus") == [kathleen, nickel, arsenic, hummus, indexurl]
    assert lucky_search(corpus, "the") == nickel
    assert ordered_search(corpus, "the") == [nickel, arsenic, hummus, indexurl]
    assert lucky_search(corpus, "babaganoush") == None
    assert ordered_search(corpus, "babaganoush") == None
    print "Finished tests."
def test_engine():
    print "Testing..."
    index, graph = crawl_web('http://udacity.com/cs101x/urank/index.html')
    ranks = compute_ranks(graph)
    kathleen = 'http://udacity.com/cs101x/urank/kathleen.html'
    nickel = 'http://udacity.com/cs101x/urank/nickel.html'
    arsenic = 'http://udacity.com/cs101x/urank/arsenic.html'
    hummus = 'http://udacity.com/cs101x/urank/hummus.html'
    indexurl = 'http://udacity.com/cs101x/urank/index.html'
    # print lucky_search(index, ranks, 'Hummus')
    assert lucky_search(index, ranks, 'Hummus') == kathleen
    #print ordered_search(index, ranks, 'Hummus')
    assert ordered_search(index, ranks, 'Hummus') == [kathleen, nickel, arsenic, hummus, indexurl]
    #print lucky_search(index, ranks, 'the')
    assert lucky_search(index, ranks, 'the') == nickel
    #print ordered_search(index, ranks, 'the')
    assert ordered_search(index, ranks, 'the') == [nickel, arsenic, hummus, indexurl]
    #print lucky_search(index, ranks, 'babaganoush')
    assert lucky_search(index, ranks, 'babaganoush') == None
    assert ordered_search(index, ranks, 'babaganoush') == None
    print "Finished tests."
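# The asserts above pin down the search API contract. Below is a minimal,
# hypothetical sketch of lucky_search and ordered_search consistent with
# those asserts, assuming the CS101-style structures: index maps a keyword
# to a list of urls, ranks maps a url to its score. This is illustrative,
# not the actual search module.
def lucky_search(index, ranks, keyword):
    # return the single highest-ranked page containing the keyword
    pages = index.get(keyword)
    if not pages:
        return None
    return max(pages, key=lambda page: ranks[page])


def ordered_search(index, ranks, keyword):
    # return every page containing the keyword, best-ranked first
    pages = index.get(keyword)
    if not pages:
        return None
    return sorted(pages, key=lambda page: ranks[page], reverse=True)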
import sys
from heapq import heappush
from pygoogle import pygoogle


def test_engine():
    print "Testing..."
    keyword = 'star wars'
    if keyword.split() == []:
        print 'Please input something.'
    n = 20
    g = pygoogle(keyword)
    g.pages = 2
    tocrawl = []
    # seed the priority queue with the top Google results
    for url in g.get_urls()[0:10]:
        heappush(tocrawl, (-sys.maxint - 1, url))
    crawled = crawl_web(tocrawl, keyword, n)
    #print crawled
    #urls = rank_level_search(corpus, crawled)
    #for url in urls:
    #    print url
    print "Crawling is finished; please check the result in result.txt"
    print "The accuracy rate is", test(keyword, n)
    print "Finished tests."
#Main file
from crawler import crawl_web
from search import lucky_search, ordered_search

print "Please enter a seed page:"
seed = raw_input()
corpus = crawl_web(seed)
print "Index:\n\n\n"
print corpus.index
print "\n\n\n"
print "Graph:\n\n\n"
print corpus.graph
print "\n\n\n"
print "Ranks:\n\n\n"
print corpus.ranks
print "\n\n\n"
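# A hypothetical sketch of the corpus object the prints above rely on: a
# crawl_web return value bundling the index, link graph, and ranks as
# attributes. The class name and fields here are assumptions for
# illustration, not the actual crawler module.
class Corpus(object):
    def __init__(self):
        self.index = {}  # keyword -> list of urls containing that keyword
        self.graph = {}  # url -> list of urls that page links to
        self.ranks = {}  # url -> page rank score computed from the graph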
# This is the main file you should make changes to.
#
# To run this code locally and figure out how it works,
# please download the code from our GitHub page
# http://udacity.github.io/cs101
# and run the server locally - python studentMain.py
#
from search import lucky_search
from crawler import crawl_web, compute_ranks

corpus, graph = crawl_web('http://udacity.com/cs101x/urank/index.html')
ranks = compute_ranks(graph)


class LuckySearch(object):
    def GET(self, query):
        result = lucky_search(corpus, ranks, query)
        return result

# running some tests
from test import test_suite
test_suite()

# This will be executed only if you run this code locally
# using the command: python studentMain.py
if __name__ == "__main__":
    import web
    app = web.application(('/(.*)', 'LuckySearch'), globals())
    corpus, graph = crawl_web('http://udacity.com/cs101x/urank/index.html')
    app.run()
# coding: utf-8
import sys, os
sys.path.append(os.pardir)

from crawler import crawl_web

crawl_web('https://ja.wikipedia.org')
from time import clock  # needed for the wall-clock timing below


def search(environment, seedBase):
    # index and graph are assumed to be module-level globals that
    # crawl_web fills in as it runs
    start = clock()
    crawl_web(environment, index, graph, seedBase, 25, 1)
    result = look_up(index, 'PayPal,')
    endTime = clock() - start
    return endTime, result
def crawl():
    from crawler import crawl_web
    limite = input("Enter the maximum number of sites to crawl: ")
    webpage = 'https://www.eapn.pt/links/governo-da-republica-portuguesa-e-instituicoes-publicas'
    crawl_web(webpage, limit=limite)
# Modify this file to make sure that it behaves as asked in the video:
# if the request is '/about', the server should respond with:
# 'This is my udacious project!'
# For all other requests, the server should respond with a normal
# search response.
#
# Hint: you will need to add two strings to the tuple of inputs for
# web.application(...) as well as define a new class.
#
# To test your code locally you have to install web.py and all of
# these files locally as well.

from search import lucky_search
from crawler import crawl_web, compute_ranks

corpus, graph = crawl_web('http://udacity.com/cs101x/urank/index.html')
ranks = compute_ranks(graph)


class LuckySearch(object):
    def GET(self, query):
        result = lucky_search(corpus, ranks, query)
        return result


class About(object):
    def GET(self, query):
        return 'This is my udacious project!'

# This will be executed only if you run this code locally
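# A minimal sketch of the runner block the hint above calls for. The route
# pattern '/about(.*)' is an assumption chosen to match About.GET's
# (self, query) signature, and it must come before the '/(.*)' catch-all
# because web.py matches routes in order.
if __name__ == "__main__":
    import web
    app = web.application(('/about(.*)', 'About',
                           '/(.*)', 'LuckySearch'), globals())
    app.run()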
import crawler
import RankAndSearch as core

if __name__ == "__main__":
    print "Welcome to DataSearch!"
    print "Please enter a seed url!"
    seed = raw_input()
    print "Please enter a depth limit"
    depth = int(raw_input())  # crawl_web expects numeric limits
    print "Please enter a breadth limit"
    breadth = int(raw_input())
    index, graph = crawler.crawl_web(seed, depth, breadth)
    ranks = core.compute_ranks(graph)
    while True:
        print "Please enter what you wish to search for!"
        term = raw_input()
        print core.search(index, ranks, term)
        print index
import json

count = 0  # assumed starting point; the original snippet begins mid-script
maxc = 50000
while count < maxc:
    print 'crawling web'
    data_file = open('tocrawl_json.json', 'r')
    data = json.load(data_file)
    url = data['tocrawl']
    crawled_data_file = open('crawled_json.json', 'r')
    crawled_data = json.load(crawled_data_file)
    crawled = crawled_data['crawled']
    pages_crawled = len(crawled)
    print pages_crawled
    corpus = crawl_web(url, 3, crawled)
    print 'crawled'
    ### After the crawler has done its shizzle it sends the data as JSON to my site.
    ### Feel free to uncomment, but I don't know how much data the server can receive
    ### at once; it might also be too much for MySQL to handle at once.
    #json_corpus = corpus.to_JSON()
    #print 'saving corpus to server'
    #url = "http://www.josephmohan.co.uk/crawler/json_reciever"
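# A hedged sketch of the upload step the comments above describe. The
# endpoint URL is the one from the commented-out line, and the payload is
# assumed to be the JSON string produced by corpus.to_JSON(); urllib2 is
# the stock Python 2 way to POST a JSON body.
import urllib2


def save_corpus_to_server(json_corpus):
    request = urllib2.Request("http://www.josephmohan.co.uk/crawler/json_reciever",
                              json_corpus,
                              {'Content-Type': 'application/json'})
    response = urllib2.urlopen(request)
    return response.getcode()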
from crawler import crawl_web
from search import urank, lucky_search, lookup

#test
index, graph = crawl_web('www.dmoz.org')  #seed page
ranks = urank(graph)
keyword = raw_input('Enter search keyword: ')
print (lucky_search(index, ranks, keyword))
print (lookup(index, keyword))

#Stuff to add:
# 1. Support for multiple keywords
# 2. Integration with web page
# 3. Removal of useless keywords and punctuation
# 4. Add meaning to punctuation
# 5. Listing of all results according to their rank
#****6. A better page rank algorithm****
#
#-End-
#!/usr/bin/python -tt
from crawler import crawl_web, compute_ranks
from search import lucky_search, lookup_best

wc = crawl_web('localhost', 200)
ranks = compute_ranks(wc, 50)
print 'Enter 0 to exit.'
while True:
    user_input = raw_input("Enter: keyword any_number_for_lucky_search [e.g. php]: ")
    if user_input:
        input = user_input.split()
        word = input[0]
        option = 1 if len(input) == 1 else 2
        if word == '0':
            break
        if option == 1:
            results = lookup_best(word, wc, ranks)
            if results:
                for result in results[:25]:
                    print result, '\t' * 4, ranks[result]
            else:
                print 'No results!'
        else:
            print lucky_search(word, wc, ranks) or 'Not lucky! -_-'
    print

print '\nThanks for using me!'