def crawl_web(seed):  # social-network variant: returns the list of users crawled
    tocrawl = set([seed])
    crawled = []
    corpus = WebCorpus()
    while tocrawl:
        user = tocrawl.pop()  # renamed from id, which shadows the builtin
        if user not in crawled:
            content = get_page(user)
            friends = get_all_friends(content)
            corpus.add_friend(user, friends)
            tocrawl.update(friends)
            crawled.append(user)
    return crawled  # note: the corpus is built but only the crawl order is returned
def crawl_web(seed):  # returns index, graph of outlinks
    tocrawl = set([seed])
    crawled = []
    wcorpus = WebCorpus()
    while tocrawl:
        url = tocrawl.pop()  # changed page to url - clearer name
        if url not in crawled:
            content = get_page(url)
            add_page_to_index(wcorpus.index, url, content)
            outlinks = get_all_links(content)
            wcorpus.graph[url] = outlinks
            tocrawl.update(outlinks)
            crawled.append(url)
    return wcorpus
def crawl_web(seed):  # returns index, graph of outlinks
    tocrawl = set([seed])
    crawled = []
    corpus = WebCorpus()
    while tocrawl:
        url = tocrawl.pop()  # changed page to url - clearer name
        if url not in crawled:
            content = get_page(url)
            outlinks = get_all_links(content)
            corpus.add_page(url, content, outlinks)
            tocrawl.update(outlinks)
            crawled.append(url)
    corpus.finish_crawl()
    return corpus
import json
from bs4 import BeautifulSoup

def crawl_web(seed, max_depth, crawled):  # returns index, graph of outlinks
    tocrawl = set(seed)  # seed is expected to be an iterable of urls
    depth = 0
    wcorpus = WebCorpus()
    while tocrawl:
        url = tocrawl.pop()
        if url not in crawled:
            try:
                content = real_get_page(url)
                if content:
                    words = get_all_words(content)
                    if 'joe' in words or 'Joe' in words or 'joseph' in words or 'Joseph' in words:
                        print(url)
                    outlinks = get_all_links(content)
                    for outlink in outlinks:
                        wcorpus.add_link(url, outlink)
                    if depth >= max_depth:
                        # depth limit reached: checkpoint the crawl state and stop
                        tocrawl.update(set(outlinks))
                        write_tocrawl(list(tocrawl))
                        write_crawled(crawled)
                        return wcorpus
                    soup = BeautifulSoup(content, "html.parser")
                    try:
                        title = soup.title.string
                    except AttributeError:
                        title = ''
                    try:
                        desc = soup.findAll(attrs={"name": "description"})[0]['content']
                    except (IndexError, KeyError):
                        desc = ''
                    for word in words:
                        wcorpus.add_word_occurrence(url, word, title, desc)
                    tocrawl.update(outlinks)
                    # checkpoint the frontier after every page
                    with open('tocrawl_json.json', 'w') as outfile:
                        json.dump({'tocrawl': list(tocrawl)}, outfile)
                    crawled.append(url)
                    depth += 1  # note: counts pages crawled, not link depth
                else:
                    crawled.append(url)  # mark unfetchable pages as crawled too
            except Exception as e:
                print("ERROR " + str(e))
    return wcorpus
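# The deep-crawl variant above checkpoints its state through write_tocrawl and
# write_crawled, which are defined elsewhere. A minimal sketch, mirroring the
# inline json dump it already performs; the 'crawled_json.json' filename is an
# assumption:
import json

def write_tocrawl(urls):
    # persist the crawl frontier so an interrupted crawl can resume
    with open('tocrawl_json.json', 'w') as outfile:
        json.dump({'tocrawl': urls}, outfile)

def write_crawled(urls):
    # persist the list of pages already visited
    with open('crawled_json.json', 'w') as outfile:  # assumed filename
        json.dump({'crawled': urls}, outfile)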
def crawl_web(seed):  # returns index, graph of outlinks
    tocrawl = set([seed])
    crawled = []
    wcorpus = WebCorpus()
    while tocrawl:
        url = tocrawl.pop()  # changed page to url - clearer name
        if url not in crawled:
            content = get_page(url)
            outlinks = get_all_links(content)
            for outlink in outlinks:
                wcorpus.add_link(url, outlink)
            for word in content.split():
                wcorpus.add_word_occurrence(url, word)
            tocrawl.update(outlinks)
            crawled.append(url)
    return wcorpus
def crawlWeb(seed):
    toCrawl = set([seed])  # start with a seed page
    crawled = []  # keep a record of sites crawled to prevent repeat visits
    wcorpus = WebCorpus()
    while toCrawl:
        url = toCrawl.pop()
        if url not in crawled:  # check whether already crawled
            content = get_page(url)  # read in all of the page's html text
            outlinks = getAllLinks(content)  # store outlinks for building the graph
            for outlink in outlinks:
                wcorpus.add_link(url, outlink)
            for word in content.split():
                wcorpus.add_word_occurrence(url, word)
            toCrawl.update(outlinks)  # add outlinks to the toCrawl set if not crawled already
            crawled.append(url)  # store the page that we popped in crawled
    return wcorpus
def crawl_web(seed):  # returns webcorpus (includes index, graph)
    tocrawl = set([seed])
    crawled = []
    corpus = WebCorpus()
    while tocrawl:
        url = tocrawl.pop()
        if url not in crawled:
            content = get_page(url)
            add_page_to_index(corpus, url, content)
            outlinks = get_all_links(content)
            for outlink in outlinks:
                corpus.add_link(url, outlink)
            tocrawl.update(outlinks)
            crawled.append(url)
    return corpus
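# All of the crawl_web variants above rely on two helpers defined elsewhere.
# A minimal sketch, assuming get_page returns a page's html as a string
# (empty on failure) and get_all_links returns the href targets found in it;
# the _LinkParser class is our own scaffolding:
import urllib.request
from html.parser import HTMLParser

def get_page(url):
    # fetch url and return its html, or '' if the request fails
    try:
        with urllib.request.urlopen(url) as response:
            return response.read().decode('utf-8', errors='replace')
    except Exception:
        return ''

class _LinkParser(HTMLParser):
    # collects the href attribute of every <a> tag fed to it
    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.links.append(value)

def get_all_links(content):
    # return every link target in content, in document order
    parser = _LinkParser()
    parser.feed(content)
    return parser.links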
def test_engine():
    print("Testing...")
    content = """This is a sample <a href="http://www.example.com">webpage</a>
    with <a href="http://www.go.to">two links</a> that lead nowhere special.
    """
    outlinks = ["http://www.example.com", "http://www.go.to"]
    corpus = WebCorpus()
    assert corpus.lookup("anything") is None
    for link in outlinks:
        corpus.add_link("http://www.test.info", link)
    assert corpus._graph["http://www.test.info"] == outlinks
    corpus.add_word_occurrence("http://www.test.info", "sample")
    assert corpus._index["sample"] == ["http://www.test.info"]
    print("Finished tests.")
###
### You should define the WebCorpus class in the file webcorpus.py
###
from webcorpus import WebCorpus

print("Testing webcorpus...")
wc1 = WebCorpus()
assert isinstance(wc1.index, dict)
assert isinstance(wc1.graph, dict)
print("Finished tests.")
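# webcorpus.py -- a minimal sketch of the WebCorpus class the snippets above
# assume, covering only what the tests exercise: writable index and graph
# dicts, add_link, add_word_occurrence, lookup, and the underscored aliases
# read by test_engine. Variants that call add_page, finish_crawl, add_friend,
# or get_results/get_count would need further methods not sketched here.

class WebCorpus:
    def __init__(self):
        self.index = {}  # word -> list of urls where it occurs
        self.graph = {}  # url -> list of outlinks

    # test_engine reads the underscored names, so alias them
    @property
    def _index(self):
        return self.index

    @property
    def _graph(self):
        return self.graph

    def add_link(self, url, outlink):
        # record one url -> outlink edge in the link graph
        self.graph.setdefault(url, []).append(outlink)

    def add_word_occurrence(self, url, word, title='', desc=''):
        # record that word occurs on url; title/desc are accepted (and
        # ignored here) because the deep-crawl variant passes them
        urls = self.index.setdefault(word, [])
        if url not in urls:
            urls.append(url)

    def lookup(self, word):
        # return the list of urls containing word, or None if unseen
        return self.index.get(word)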
def select_attribute(NN, JJ):
    # NN is a noun, JJ an adjective; attributes and the *_patterns lists
    # are assumed to be defined at module level
    NN = NN.lower()
    JJ = JJ.lower()
    search = WebCorpus()
    # Get counts for the attribute dimensions for both the noun and adjective
    nn_vector = {att: 0 for att in attributes}
    jj_vector = {att: 0 for att in attributes}
    for ATTR in attributes:
        # Collect noun numbers
        for p in NN_nn_jj_attr_patterns:
            results = search.get_results(p % (NN, JJ, ATTR))
            nn_vector[ATTR] += search.get_count(results)
        for p in NN_attr_jj_nn_patterns:
            results = search.get_results(p % (ATTR, JJ, NN))
            nn_vector[ATTR] += search.get_count(results)
        # Collect adjective numbers
        for p in JJ_jj_attr_patterns:
            results = search.get_results(p % (JJ, ATTR))
            jj_vector[ATTR] += search.get_count(results)
        for p in JJ_attr_nn_jj_patterns:
            results = search.get_results(p % (ATTR, NN, JJ))
            jj_vector[ATTR] += search.get_count(results)
        for p in JJ_nn_attr_jj_patterns:
            results = search.get_results(p % (NN, ATTR, JJ))
            jj_vector[ATTR] += search.get_count(results)
    # Combine the two vectors elementwise and pick the strongest attribute
    sel_vector = {}
    for k, v in nn_vector.items():
        sel_vector[k] = v * jj_vector[k]
    attribute = ""
    best = 0
    for k, v in sel_vector.items():
        if v > best:  # fixed: the running maximum was never updated
            best = v
            attribute = k
    return attribute
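# The selection step in select_attribute multiplies the noun and adjective
# evidence vectors elementwise and keeps the attribute with the largest
# product. A tiny self-contained illustration with made-up counts:
nn_vector = {"temperature": 12, "size": 3, "color": 0}  # hypothetical counts
jj_vector = {"temperature": 8, "size": 1, "color": 5}
sel_vector = {k: v * jj_vector[k] for k, v in nn_vector.items()}
assert max(sel_vector, key=sel_vector.get) == "temperature"  # 96 beats 3 and 0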