Code Example #1
File: crawler.py  Project: xuweizhixin/WebSearch
def crawl_web(seed):
    tocrawl = set([seed])
    crawled = []
    corpus = WebCorpus()
    while tocrawl:
        id = tocrawl.pop()
        if id not in crawled:
            content = get_page(id)
            friends = get_all_friends(content)
            corpus.add_friend(id, friends)
            tocrawl.update(friends)
            crawled.append(id)
    return crawled
Code Example #2
File: crawler.py  Project: DesPenny/cs101
def crawl_web(seed): # returns index, graph of inlinks
    tocrawl = set([seed])
    crawled = []
    wcorpus = WebCorpus()
    while tocrawl: 
        url = tocrawl.pop() # changed page to url - clearer name
        if url not in crawled:
            content = get_page(url)
            add_page_to_index(wcorpus.index, url, content)
            outlinks = get_all_links(content)
            wcorpus.graph[url] = outlinks
            tocrawl.update(outlinks)
            crawled.append(url)
    return wcorpus
Code Example #3
File: crawler.py  Project: ricktan/cs101
def crawl_web(seed):  # returns index, graph of inlinks
    tocrawl = set([seed])
    crawled = []
    corpus = WebCorpus()
    while tocrawl:
        url = tocrawl.pop()  # changed page to url - clearer name
        if url not in crawled:
            content = get_page(url)
            outlinks = get_all_links(content)
            corpus.add_page(url, content, outlinks)
            tocrawl.update(outlinks)
            crawled.append(url)
    corpus.finish_crawl()
    return corpus
Code Example #4
File: crawler.py  Project: tylerCarter/Python
def crawl_web(seed):  # returns index, graph of inlinks
    tocrawl = set([seed])
    crawled = []
    wcorpus = WebCorpus()
    while tocrawl:
        url = tocrawl.pop()  # changed page to url - clearer name
        if url not in crawled:
            content = get_page(url)
            add_page_to_index(wcorpus.index, url, content)
            outlinks = get_all_links(content)
            wcorpus.graph[url] = outlinks
            tocrawl.update(outlinks)
            crawled.append(url)
    return wcorpus
Code Example #5
def crawl_web(seed, max_depth, crawled):  # returns index, graph of inlinks
    tocrawl = set(seed)
    next_depth = []
    depth = 0
    wcorpus = WebCorpus()
    last_url = None
    while tocrawl:
        url = tocrawl.pop()
        if url not in crawled:
            try:
                content = real_get_page(url)
                if content:
                    words = get_all_words(content)
                    # only index pages that mention the target name
                    if 'joe' in words or 'Joe' in words or 'joseph' in words or 'Joseph' in words:
                        print url
                        outlinks = get_all_links(content)
                        for outlink in outlinks:
                            wcorpus.add_link(url, outlink)
                        if depth >= max_depth:
                            # depth limit reached: save progress and stop crawling
                            tocrawl.update(set(outlinks))
                            write_tocrawl(list(tocrawl))
                            write_crawled(crawled)
                            return wcorpus
                        soup = BeautifulSoup(content, "html.parser")
                        try:
                            title = soup.title.string
                        except:
                            title = ''
                        try:
                            desc = str(soup.findAll(attrs={"name": "description"})[0]['content'].encode('utf-8'))
                        except:
                            desc = ''
                        for word in words:
                            wcorpus.add_word_occurrence(url, str(word.encode("UTF-8")), title, desc)
                        tocrawl.update(outlinks)
                        # persist the frontier so the crawl can be resumed later
                        with open('tocrawl_json.json', 'w') as outfile:
                            json.dump({'tocrawl': list(tocrawl)}, outfile)
                        crawled.append(url)
                        depth += 1
                    outlinks = get_all_links(content)
                    crawled.append(url)
                    tocrawl.update(outlinks)
            except Exception, e:
                print "ERROR " + str(e)
    # return the corpus even if the frontier empties before max_depth is reached
    return wcorpus
Code Example #6
File: crawler.py  Project: sachinlohith/searchengine
def crawl_web(seed): # returns index, graph of inlinks
    tocrawl = set([seed])
    crawled = []
    wcorpus = WebCorpus()
    while tocrawl: 
        url = tocrawl.pop() # changed page to url - clearer name
        if url not in crawled:
            content = get_page(url)
            outlinks = get_all_links(content)
            for outlink in outlinks:
                wcorpus.add_link(url, outlink) 
            for word in content.split():
                wcorpus.add_word_occurrence(url, word)
            tocrawl.update(outlinks)
            crawled.append(url)
    return wcorpus
Code Example #7
def crawlWeb(seed):
	toCrawl = set([seed])		#start with a seed page
	crawled = []			#keep a record of sites crawled to prevent repeat visits
	wcorpus = WebCorpus()
	while toCrawl:
		url = toCrawl.pop()					
		if url not in crawled:				#check whether already crawled
			content = get_page(url)		#read-in all of the page's html text
			outlinks = getAllLinks(content)  #store outlinks in var for building graph
			for outlink in outlinks:
				wcorpus.add_link(url, outlink)
			for word in content.split():
				wcorpus.add_word_occurrence(url, word)
			toCrawl.update(outlinks)		#add outlinks to toCrawl stack if we haven't crawled them already
			crawled.append(url)				#store page that we popped in crawled. 
	return wcorpus	
Code Example #8
File: crawler.py  Project: lipghee/Udacity
def crawl_web(seed):  # returns webcorpus (includes index, graph)
    tocrawl = set([seed])
    crawled = []
    corpus = WebCorpus()

    while tocrawl:
        url = tocrawl.pop()
        if url not in crawled:
            content = get_page(url)
            add_page_to_index(corpus, url, content)

            outlinks = get_all_links(content)
            for outlink in outlinks:
                corpus.add_link(url, outlink)

            tocrawl.update(outlinks)
            crawled.append(url)

    return corpus
Code Example #9
File: crawler.py  Project: cbeltranv/Udacity
def crawl_web(seed): # returns webcorpus (includes index, graph)
    tocrawl = set([seed])
    crawled = []
    corpus = WebCorpus()

    while tocrawl: 
        url = tocrawl.pop()
        if url not in crawled:
            content = get_page(url)
            add_page_to_index(corpus, url, content)
            
            outlinks = get_all_links(content)
            for outlink in outlinks:
                corpus.add_link(url, outlink)

            tocrawl.update(outlinks)
            crawled.append(url)
    
    return corpus
Code Example #10
File: studentMain.py  Project: tylerCarter/Python
def test_engine():
    print "Testing..."
    content = """This is a sample <a href="http://www.example.com">webpage</a> with 
    <a href="http://www.go.to">two links</a> that lead nowhere special.
    """
    outlinks = ["http://www.example.com", "http://www.go.to"]

    corpus = WebCorpus()
    assert corpus.lookup("anything") == None
    for link in outlinks:
        corpus.add_link("http://www.test.info", link)
    assert corpus._graph["http://www.test.info"] == outlinks
    corpus.add_word_occurrence("http://www.test.info", "sample")
    assert corpus._index["sample"] == ["http://www.test.info"]

    print "Finished tests."
Code Example #11
File: crawler.py  Project: tylerCarter/Python
def crawl_web(seed):  # returns index, graph of inlinks
    tocrawl = set([seed])
    crawled = []
    corpus = WebCorpus()
    while tocrawl:
        url = tocrawl.pop()  # changed page to url - clearer name
        if url not in crawled:
            content = get_page(url)
            outlinks = get_all_links(content)
            corpus.add_page(url, content, outlinks)
            tocrawl.update(outlinks)
            crawled.append(url)
    corpus.finish_crawl()
    return corpus
Code Example #12
File: studentMain.py  Project: DesPenny/cs101
def test_engine():
    print "Testing..."
    content = """This is a sample <a href="http://www.example.com">webpage</a> with 
    <a href="http://www.go.to">two links</a> that lead nowhere special.
    """
    outlinks = ["http://www.example.com", "http://www.go.to"]
    
    corpus = WebCorpus()
    assert corpus.lookup("anything") == None
    for link in outlinks:
        corpus.add_link("http://www.test.info", link)
    assert corpus._graph["http://www.test.info"] == outlinks
    corpus.add_word_occurrence("http://www.test.info", "sample")
    assert corpus._index["sample"] == ["http://www.test.info"]
    
    print "Finished tests."
Code Example #13
def crawl_web(seed):  # returns index, graph of inlinks
    tocrawl = set([seed])
    crawled = []
    wcorpus = WebCorpus()
    while tocrawl:
        url = tocrawl.pop()  # changed page to url - clearer name
        if url not in crawled:
            content = get_page(url)
            outlinks = get_all_links(content)
            for outlink in outlinks:
                wcorpus.add_link(url, outlink)
            for word in content.split():
                wcorpus.add_word_occurrence(url, word)
            tocrawl.update(outlinks)
            crawled.append(url)
    return wcorpus
Code Example #14
###
### You should define the WebCorpus class in the file webcorpus.py
###
from webcorpus import WebCorpus

print "Testing webcorpus..."
wc1 = WebCorpus()
assert isinstance(wc1.index, dict)
assert isinstance(wc1.graph, dict)
print "Finished tests."
        
Code Example #15
def select_attribute(NN, JJ):
    NN = NN.lower()
    JJ = JJ.lower()
    search = WebCorpus()
    
    # Get counts for the attribute dimensions for both the noun and adjective
    nn_vector = dict([(att, 0) for att in attributes])
    jj_vector = dict([(att, 0) for att in attributes])
    for ATTR in attributes:
        # Collect noun numbers
        for p in NN_nn_jj_attr_patterns:
            results = search.get_results(p % (NN, JJ, ATTR))
            nn_vector[ATTR] += search.get_count(results)
        for p in NN_attr_jj_nn_patterns:
            results = search.get_results(p % (ATTR, JJ, NN))
            nn_vector[ATTR] += search.get_count(results)
            
        # Collect adjective numbers
        for p in JJ_jj_attr_patterns:
            results = search.get_results(p % (JJ, ATTR))
            jj_vector[ATTR] += search.get_count(results)
        for p in JJ_attr_nn_jj_patterns:
            results = search.get_results(p % (ATTR, NN, JJ))
            jj_vector[ATTR] += search.get_count(results)
        for p in JJ_nn_attr_jj_patterns:
            results = search.get_results(p % (NN, ATTR, JJ))
            jj_vector[ATTR] += search.get_count(results)
    sel_vector = {}
    for k,v in nn_vector.items():
        sel_vector[k] = v * jj_vector[k]
    attribute = ""
    max_score = 0
    for k, v in sel_vector.items():
        if v > max_score:
            max_score = v  # keep track of the best score seen so far
            attribute = k
    return attribute
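
The scoring at the end of select_attribute multiplies, for each candidate attribute, the hit count gathered from the noun patterns by the hit count gathered from the adjective patterns, and returns the attribute with the largest product. A tiny worked example with made-up counts (the attribute names and numbers are purely illustrative):

# hypothetical counts collected by the pattern queries
nn_vector = {'color': 12, 'size': 3, 'taste': 7}
jj_vector = {'color': 9, 'size': 1, 'taste': 0}

# elementwise product, as in select_attribute
sel_vector = {att: nn_vector[att] * jj_vector[att] for att in nn_vector}
# -> {'color': 108, 'size': 3, 'taste': 0}

# the attribute with the highest combined score is selected
best = max(sel_vector, key=sel_vector.get)  # -> 'color'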