Example #1
def test_engine():
    index, graph = crawl_web('http://www.udacity.com/cs101x/index.html')
    ranks = compute_ranks(graph)
    #print index
    print "_+_+_+_++_+_++_+_+_+_+_++_+_+_++"
    print lucky_search(index, ranks, 'walking')
    #>>> https://www.udacity.com/cs101x/index.html

    print lucky_search(index, ranks, 'kicking')
    #>>> https://www.udacity.com/cs101x/crawling.html

    print lucky_search(index, ranks, 'Ossifrage')
    #>>> https://www.udacity.com/cs101x/flying.html

    print lucky_search(index, ranks, 'ossifrage')
    #>>> None

    print "_+_+_+_++_+_++_+_+_+_+_++_+_+_++"
    print ordered_search(index, ranks, 'to')
    #>>> https://www.udacity.com/cs101x/index.html

    print ordered_search(index, ranks, 'Ossifrage')
    #>>> https://www.udacity.com/cs101x/flying.html

    print ordered_search(index, ranks, 'crawl')
    #>>> index crawling

    print ordered_search(index, ranks, 'ossifrage')
    #>>> None
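For context, a minimal sketch of what lucky_search and ordered_search could look like, assuming index maps a keyword to a list of URLs and ranks maps a URL to its score (signatures follow the calls above; the real implementations may differ):

def lucky_search(index, ranks, keyword):
    # Highest-ranked page containing the keyword, or None if unindexed.
    pages = index.get(keyword)
    if not pages:
        return None
    return max(pages, key=lambda url: ranks[url])

def ordered_search(index, ranks, keyword):
    # All matching pages, best-ranked first, or None if unindexed.
    pages = index.get(keyword)
    if not pages:
        return None
    return sorted(pages, key=lambda url: ranks[url], reverse=True)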
Example #2
import datetime

def test_engine():
    print "Testing..."

    start = datetime.datetime.now()

    result = crawl_web('040914')

    print "Finished tests."
    delta = datetime.datetime.now() - start
    print delta / len(result)  # average crawl time per page
Example #3
    def post(self):

        corpus, graph = crawl_web('http://udacity.com/cs101x/urank/index.html')
        ranks = compute_ranks(graph)
        query = self.request.get('text')
        result = lucky_search(corpus, ranks, query)
        if not result:
            self.render(text="", links="try www.google.com")
        else:
            self.render(text=query, links=result)
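self.render is not part of stock web.py; it is presumably a small helper on the handler that fills a results template. A hypothetical sketch of such a helper (the template name and directory are assumptions):

import web

templates = web.template.render('templates/')

class SearchHandler(object):
    def render(self, text, links):
        # Fill templates/results.html with the query text and the search result.
        return templates.results(text, links)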
Example #4
    def post(self):

        corpus, graph = crawl_web('http://udacity.com/cs101x/urank/index.html')
        ranks = compute_ranks(graph)
        query = self.request.get('text')
        result = lucky_search(corpus, ranks, query)
        if not result:
            self.render(text="", links="try www.google.com")
        else:
            self.render(text=query, links=result)
Example #5
def main(keyword):
    name = 'youfollowthefilm'
    fname = name + '.pkl'
    domain = 'http://www.' + name + '.com'
    try:
        # Reuse the cached crawl if a pickle already exists on disk.
        with open(fname, 'r') as fin:
            website = pickle.load(fin)
    except (IOError, pickle.PickleError):
        # No usable cache: crawl the live site instead.
        website = crawl_web(domain)
    return search(website, keyword), website._titles
Example #6
def main(keyword):
    name = 'youfollowthefilm'
    fname = name + '.pkl'
    domain = 'http://www.' + name + '.com'
    try:
        # Reuse the cached crawl if a pickle already exists on disk.
        with open(fname, 'r') as fin:
            website = pickle.load(fin)
    except (IOError, pickle.PickleError):
        # No usable cache: crawl the live site instead.
        website = crawl_web(domain)
    return search(website, keyword), website._titles
Example #7
    def post(self):
        key = self.request.get('key')
        url = Url.get_by_key_name(key)
        if url is None:
            page, links = crawler.crawl_web(key)

            def txn():
                url = Url.get_by_key_name(key)
                if url is None:
                    url = Url(key_name=key, uri=key)
                    for l in links:
                        taskqueue.add(queue_name='url-crawler-queue',
                                      url='/queue', params={'key': l})
                url.put()

            db.run_in_transaction(txn)
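For reference, the handler above assumes a Google App Engine datastore model roughly like this (field names inferred from Url(key_name=key, uri=key); a sketch, not the project's actual model):

from google.appengine.ext import db

class Url(db.Model):
    # Keyed by the crawled URL itself; uri stores it again as a property.
    uri = db.StringProperty()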
Example #8
def test():
    print "Testing..."
    from studentMain import LuckySearch
    index, graph = crawl_web('http://udacity.com/cs101x/urank/index.html')
    ranks = compute_ranks(graph)
    kathleen = 'http://udacity.com/cs101x/urank/kathleen.html'
    nickel = 'http://udacity.com/cs101x/urank/nickel.html'
    arsenic = 'http://udacity.com/cs101x/urank/arsenic.html'
    hummus = 'http://udacity.com/cs101x/urank/hummus.html'
    indexurl = 'http://udacity.com/cs101x/urank/index.html'
    lucky = LuckySearch()
    print lucky.GET('Hummus') == kathleen
    print lucky.GET('the') == nickel
    print lucky.GET('babaganoush') == "Try searchwithpeter.info."
    print "Finished tests."
Example #9
def test_suite():
    print "Testing...\n"
    from studentMain import LuckySearch
    index, graph = crawl_web('http://udacity.com/cs101x/urank/index.html')
    ranks = compute_ranks(graph)
    kathleen = 'http://udacity.com/cs101x/urank/kathleen.html'
    nickel = 'http://udacity.com/cs101x/urank/nickel.html'
    arsenic = 'http://udacity.com/cs101x/urank/arsenic.html'
    hummus = 'http://udacity.com/cs101x/urank/hummus.html'
    indexurl = 'http://udacity.com/cs101x/urank/index.html'
    lucky = LuckySearch()
    print lucky.GET('Hummus') == kathleen
    print lucky.GET('the') == nickel
    print lucky.GET('babaganoush') == "Try searchwithpeter.info."
    print "\nFinished tests."
Example #10
def main():
    name = 'youfollowthefilm'
    fname = name + '.pkl'
    domain = 'http://www.' + name + '.com'
    try:
        # Reuse the cached crawl if a pickle already exists on disk.
        with open(fname, 'r') as fin:
            website = pickle.load(fin)
            print "Successfully read my_site from " + fname
    except (IOError, pickle.PickleError):
        # No usable cache: crawl the live site and cache the result.
        website = crawl_web(domain)
        try:
            with open(fname, 'w') as fout:
                pickle.dump(website, fout)
                print "Successfully wrote my_site to " + fname
        except IOError, e:
            print "Cannot write out my_site: " + str(e)
Example #11
def main():
    name = 'youfollowthefilm'
    fname = name + '.pkl'
    domain = 'http://www.' + name + '.com'
    try:
        # Reuse the cached crawl if a pickle already exists on disk.
        with open(fname, 'r') as fin:
            website = pickle.load(fin)
            print "Successfully read my_site from " + fname
    except (IOError, pickle.PickleError):
        # No usable cache: crawl the live site and cache the result.
        website = crawl_web(domain)
        try:
            with open(fname, 'w') as fout:
                pickle.dump(website, fout)
                print "Successfully wrote my_site to " + fname
        except IOError, e:
            print "Cannot write out my_site: " + str(e)
Example #12
def test_engine():
    print "Testing..."
    kathleen = 'http://udacity.com/cs101x/urank/kathleen.html'
    nickel = 'http://udacity.com/cs101x/urank/nickel.html'
    arsenic = 'http://udacity.com/cs101x/urank/arsenic.html'
    hummus = 'http://udacity.com/cs101x/urank/hummus.html'
    indexurl = 'http://udacity.com/cs101x/urank/index.html'

    corpus = crawl_web('http://udacity.com/cs101x/urank/index.html')

    assert lucky_search(corpus, 'Hummus') == kathleen
    assert ordered_search(corpus, 'Hummus') == [kathleen, nickel, arsenic, hummus, indexurl] 
    assert lucky_search(corpus, 'the') == nickel
    assert ordered_search(corpus, 'the') == [nickel, arsenic, hummus, indexurl]
    assert lucky_search(corpus, 'babaganoush') is None
    assert ordered_search(corpus, 'babaganoush') is None
    print "Finished tests."
Example #13
File: Main.py Project: tar07/Python
def test_engine():
    print "Testing..."
    kathleen = 'http://udacity.com/cs101x/urank/kathleen.html'
    nickel = 'http://udacity.com/cs101x/urank/nickel.html'
    arsenic = 'http://udacity.com/cs101x/urank/arsenic.html'
    hummus = 'http://udacity.com/cs101x/urank/hummus.html'
    indexurl = 'http://udacity.com/cs101x/urank/index.html'

    corpus = crawl_web('http://udacity.com/cs101x/urank/index.html')
    fname = 'corpus.pkl'
    
    try:
        with open(fname, 'w') as fout:
            pickle.dump(corpus, fout)
            print "Pickled file to " + fname
    except IOError, e:
        print "Failed to write to file: " + str(e)
Example #14
def test_engine():
    print "Testing..."
    kathleen = 'http://udacity.com/cs101x/urank/kathleen.html'
    nickel = 'http://udacity.com/cs101x/urank/nickel.html'
    arsenic = 'http://udacity.com/cs101x/urank/arsenic.html'
    hummus = 'http://udacity.com/cs101x/urank/hummus.html'
    indexurl = 'http://udacity.com/cs101x/urank/index.html'

    corpus = crawl_web('http://udacity.com/cs101x/urank/index.html')

    assert lucky_search(corpus, 'Hummus') == kathleen
    assert ordered_search(corpus, 'Hummus') == [kathleen, nickel, arsenic, hummus, indexurl] 
    assert lucky_search(corpus, 'the') == nickel
    assert ordered_search(corpus, 'the') == [nickel, arsenic, hummus, indexurl]
    assert lucky_search(corpus, 'babaganoush') is None
    assert ordered_search(corpus, 'babaganoush') is None
    print "Finished tests."
Example #15
def test_engine():
    print "Testing..."
    kathleen = 'http://udacity.com/cs101x/urank/kathleen.html'
    nickel = 'http://udacity.com/cs101x/urank/nickel.html'
    arsenic = 'http://udacity.com/cs101x/urank/arsenic.html'
    hummus = 'http://udacity.com/cs101x/urank/hummus.html'
    indexurl = 'http://udacity.com/cs101x/urank/index.html'

    corpus = crawl_web('http://udacity.com/cs101x/urank/index.html')
    fname = 'corpus.pkl'

    # YOUR CODE HERE
    try:
        with open(fname, 'w') as fout:
            pickle.dump(corpus, fout)
            print "Successfully wrote to " + fname
    except IOError, e:
        print "Most odacious! Cannot write to corpus " + str(e)
Example #16
def test_engine():
    print "Testing..."
    kathleen = "http://udacity.com/cs101x/urank/kathleen.html"
    nickel = "http://udacity.com/cs101x/urank/nickel.html"
    arsenic = "http://udacity.com/cs101x/urank/arsenic.html"
    hummus = "http://udacity.com/cs101x/urank/hummus.html"
    indexurl = "http://udacity.com/cs101x/urank/index.html"

    corpus = crawl_web("http://udacity.com/cs101x/urank/index.html")
    fname = "corpus.pkl"

    # YOUR CODE HERE

    assert lucky_search(corpus, "Hummus") == kathleen
    assert ordered_search(corpus, "Hummus") == [kathleen, nickel, arsenic, hummus, indexurl]
    assert lucky_search(corpus, "the") == nickel
    assert ordered_search(corpus, "the") == [nickel, arsenic, hummus, indexurl]
    assert lucky_search(corpus, "babaganoush") == None
    assert ordered_search(corpus, "babaganoush") == None
    print "Finished tests."
Example #17
def test_engine():
    print "Testing..."
    index, graph = crawl_web('http://udacity.com/cs101x/urank/index.html')
    ranks = compute_ranks(graph)
    kathleen = 'http://udacity.com/cs101x/urank/kathleen.html'
    nickel = 'http://udacity.com/cs101x/urank/nickel.html'
    arsenic = 'http://udacity.com/cs101x/urank/arsenic.html'
    hummus = 'http://udacity.com/cs101x/urank/hummus.html'
    indexurl = 'http://udacity.com/cs101x/urank/index.html'
    # print lucky_search(index, ranks, 'Hummus')
    assert lucky_search(index, ranks, 'Hummus') == kathleen
    #print ordered_search(index, ranks, 'Hummus')
    assert ordered_search(index, ranks, 'Hummus') == [kathleen, nickel, arsenic, hummus, indexurl] 
    #print lucky_search(index, ranks, 'the')
    assert lucky_search(index, ranks, 'the') == nickel
    #print ordered_search(index, ranks, 'the')
    assert ordered_search(index, ranks, 'the') == [nickel, arsenic, hummus, indexurl]
    #print lucky_search(index, ranks, 'babaganoush')
    assert lucky_search(index, ranks, 'babaganoush') is None
    assert ordered_search(index, ranks, 'babaganoush') is None
    print "Finished tests."
Example #18
def test_engine():
    print "Testing..."

    keyword = 'star wars'
    if keyword.split() == []:
        print 'Please input something.'
    n = 20
    g = pygoogle(keyword)
    g.pages = 2
    tocrawl = []
    for url in g.get_urls()[0:10]:
        heappush(tocrawl, (-sys.maxint - 1, url))
    crawled = crawl_web(tocrawl, keyword, n)

    #print crawled
    #urls = rank_level_search(corpus, crawled)
    #for url in urls:
    #    print url
    print "Now finish crawling, please check the result in result.txt"

    print "The accururay rate is",test(keyword,n)

    print "Finished tests."
Example #19
def test_engine():
    print "Testing..."
    index, graph = crawl_web('http://udacity.com/cs101x/urank/index.html')
    ranks = compute_ranks(graph)
    kathleen = 'http://udacity.com/cs101x/urank/kathleen.html'
    nickel = 'http://udacity.com/cs101x/urank/nickel.html'
    arsenic = 'http://udacity.com/cs101x/urank/arsenic.html'
    hummus = 'http://udacity.com/cs101x/urank/hummus.html'
    indexurl = 'http://udacity.com/cs101x/urank/index.html'
    # print lucky_search(index, ranks, 'Hummus')
    assert lucky_search(index, ranks, 'Hummus') == kathleen
    #print ordered_search(index, ranks, 'Hummus')
    assert ordered_search(index, ranks, 'Hummus') == [
        kathleen, nickel, arsenic, hummus, indexurl
    ]
    #print lucky_search(index, ranks, 'the')
    assert lucky_search(index, ranks, 'the') == nickel
    #print ordered_search(index, ranks, 'the')
    assert ordered_search(index, ranks,
                          'the') == [nickel, arsenic, hummus, indexurl]
    #print lucky_search(index, ranks, 'babaganoush')
    assert lucky_search(index, ranks, 'babaganoush') is None
    assert ordered_search(index, ranks, 'babaganoush') is None
    print "Finished tests."
Example #20
#Main file

from crawler import crawl_web
from search import lucky_search, ordered_search

print "Please enter a seed page:"

seed = raw_input()

corpus = crawl_web(seed)

print "Index\n\n\n:"
print corpus.index
print "\n\n\n"

print "Graph:\n\n\n"
print corpus.graph
print "\n\n\n" 

print "Ranks:\n\n\n"
print corpus.ranks
print "\n\n\n"



Example #21
# This is the main file you should make changes
#
# To run this code locally and figure out how it works
# please download the code from our GitHub page
# http://udacity.github.io/cs101
# and run the server locally - python studentMain.py
#
from search import lucky_search
from crawler import crawl_web, compute_ranks

corpus, graph = crawl_web('http://udacity.com/cs101x/urank/index.html')
ranks = compute_ranks(graph)

class LuckySearch(object):        
    def GET(self, query):
        result = lucky_search(corpus, ranks, query)
        return result

# running some tests
from test import test_suite
test_suite()
    
# This will be executed only if you run this code locally
# using a command: python studentMain.py

if __name__ == "__main__":
    import web
    app = web.application(('/(.*)', 'LuckySearch'), globals())
    corpus, graph = crawl_web('http://udacity.com/cs101x/urank/index.html')
    app.run()        
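For reference, compute_ranks here is the course's simplified PageRank over the link graph, where graph maps each URL to the URLs it links to. A sketch consistent with the CS101 version, though the constants may differ:

def compute_ranks(graph, damping=0.8, numloops=10):
    # Iteratively propagate rank: a page's rank is a damped sum of the
    # ranks of the pages linking to it, plus a uniform base share.
    npages = len(graph)
    ranks = {}
    for page in graph:
        ranks[page] = 1.0 / npages
    for _ in range(numloops):
        newranks = {}
        for page in graph:
            newrank = (1 - damping) / npages
            for node in graph:
                if page in graph[node]:
                    newrank += damping * ranks[node] / len(graph[node])
            newranks[page] = newrank
        ranks = newranks
    return ranks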
Example #22
# coding: utf-8

import sys,os
sys.path.append(os.pardir)

from crawler import crawl_web
crawl_web('https://ja.wikipedia.org')
Example #23
from time import clock  # assumed import; clock() is used as a simple timer here

def search(environment, seedBase):
  # index and graph are presumably module-level globals in the full script.
  start = clock()
  crawl_web(environment, index, graph, seedBase, 25, 1)
  result = look_up(index, 'PayPal,')
  endTime = clock() - start
  return endTime, result
Example #24
File: main.py Project: andreaxe/pdi
def crawl():
    from crawler import crawl_web

    limite = int(raw_input("Enter the maximum number of sites to crawl: "))
    webpage = 'https://www.eapn.pt/links/governo-da-republica-portuguesa-e-instituicoes-publicas'
    crawl_web(webpage, limit=limite)
Example #25
# Modify this file to make sure that it behaves as asked in the video:
#     if the request is '/about', the server should respond with:
#          'This is my udacious project!'
#     for all other requests, the server should respond with a normal
#     search response.
#
# Hint: you will need to add two strings to the tuple of inputs for
#    web.application(...) as well as define a new class.
#
# To test your code locally you have to install web.py and all of
# these files locally as well.

from search import lucky_search
from crawler import crawl_web, compute_ranks

corpus, graph = crawl_web('http://udacity.com/cs101x/urank/index.html')
ranks = compute_ranks(graph)


class LuckySearch(object):
    def GET(self, query):
        result = lucky_search(corpus, ranks, query)
        return result


class About(object):
    def GET(self, query):
        return 'This is my udacious project!'


# This will be executed only if you run this code locally
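The snippet stops at that comment; following the hint at the top and the pattern in Example #21, the local-run block would plausibly route '/about' to the new class before the catch-all (a sketch, not the graded solution; note the capture group so About.GET receives its query argument):

if __name__ == "__main__":
    import web
    # '/(about)' is matched first and served by About; everything else
    # falls through to LuckySearch.
    app = web.application(('/(about)', 'About',
                           '/(.*)', 'LuckySearch'), globals())
    app.run()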
Example #26

import crawler
import RankAndSearch as core

if __name__ == "__main__":
    print "Welcome to DataSearch!"
    print "Please enter a seed url!"
    seed = raw_input()
    print "Please enter a depth limit"
    depth = int(raw_input())  # crawl limits are numeric, so convert the input
    print "Please enter a breadth limit"
    breadth = int(raw_input())
    index, graph = crawler.crawl_web(seed, depth, breadth)
    ranks = core.compute_ranks(graph)
    while True:
        print "please enter what you wish to search for!"
        term = raw_input()
        print core.search(index, ranks, term)
        print index
Example #27
maxc = 50000
# 'count' is presumably initialized and incremented elsewhere in the full script.
while count < maxc:

    print 'crawling web'
    with open('tocrawl_json.json', 'r') as data_file:
        data = json.load(data_file)

    url = data['tocrawl']
    with open('crawled_json.json', 'r') as crawled_data_file:
        crawled_data = json.load(crawled_data_file)
    crawled = crawled_data['crawled']
    pages_crawled = len(crawled)
    print pages_crawled

    corpus = crawl_web(url, 3, crawled)

    print 'crawled'

    ### After the crawler has done its shizzle it sends the data as JSON to my site.
    ### Feel free to uncomment, but I don't know how much data the server can
    ### receive at once; it might also be too much for MySQL to handle at once.

    #json_corpus = corpus.to_JSON()

    #print 'saving corpus to server'

    #url = "http://www.josephmohan.co.uk/crawler/json_reciever"
Example #28
from crawler import crawl_web
from search import urank, lucky_search, lookup
#test
index, graph = crawl_web('www.dmoz.org') #seed page
ranks = urank(graph)
keyword = raw_input('Enter search keyword: ')
print (lucky_search(index, ranks, keyword))
print (lookup(index, keyword))

#Stuff to add:
#    1. Support for multiple keywords (see the sketch below)
#    2. Integration with web page
#    3. Removal of useless keywords and punctuation
#    4. Add meaning to punctuation
#    5. Listing of all results according to their rank
#****6. A better page rank algorithm****
#
#-End-
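For to-do item 1, a minimal sketch of multiple-keyword support, assuming the same index shape used by lookup above (pages must contain every keyword in the query):

def multi_lookup(index, query):
    # Intersect the URL lists of each keyword in the query.
    results = None
    for keyword in query.split():
        urls = set(index.get(keyword, []))
        results = urls if results is None else results & urls
    return sorted(results) if results else []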
Example #29
# coding: utf-8

import sys, os
sys.path.append(os.pardir)

from crawler import crawl_web
crawl_web('https://ja.wikipedia.org')
Example #30
#!/usr/bin/python -tt

from crawler import crawl_web, compute_ranks
from search import lucky_search, lookup_best

wc = crawl_web('localhost', 200)
ranks = compute_ranks(wc, 50)

print 'Enter 0 to exit.'
while True:
    user_input = raw_input("Enter: keyword any_number_for_lucky_search [e.g. php]: ")
    if user_input:
        parts = user_input.split()
        word = parts[0]
        option = 1 if len(parts) == 1 else 2
        if word == '0':
            break
        if option == 1:
            results = lookup_best(word, wc, ranks)
            if results:
                for result in results[:25]:
                    print result, '\t'*4, ranks[result]
            else:
                print 'No results!'
        else:
            print lucky_search(word, wc, ranks) or 'Not lucky! -_-'
    print

print '\nThanks for using me!'
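lookup_best evidently returns the keyword's matches ordered by rank (the loop prints each result with its rank); a sketch consistent with the call lookup_best(word, wc, ranks), assuming wc behaves as a keyword-to-URLs index:

def lookup_best(keyword, index, ranks):
    # All pages containing the keyword, best-ranked first.
    return sorted(index.get(keyword, []),
                  key=lambda url: ranks.get(url, 0), reverse=True)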