def crawl_web(seed):  # social-network variant: returns the list of users crawled
    tocrawl = set([seed])
    crawled = []
    corpus = WebCorpus()
    while tocrawl:
        user = tocrawl.pop()  # renamed from id, which shadows the builtin
        if user not in crawled:
            content = get_page(user)
            friends = get_all_friends(content)
            corpus.add_friend(user, friends)
            tocrawl.update(friends)
            crawled.append(user)
    return crawled  # note: the corpus is built but only the crawl order is returned
def crawl_web(seed):  # returns index, graph of outlinks
    tocrawl = set([seed])
    crawled = []
    wcorpus = WebCorpus()
    while tocrawl:
        url = tocrawl.pop()  # changed page to url - clearer name
        if url not in crawled:
            content = get_page(url)
            add_page_to_index(wcorpus.index, url, content)
            outlinks = get_all_links(content)
            wcorpus.graph[url] = outlinks
            tocrawl.update(outlinks)
            crawled.append(url)
    return wcorpus
def crawl_web(seed):  # returns index, graph of outlinks
    tocrawl = set([seed])
    crawled = []
    corpus = WebCorpus()
    while tocrawl:
        url = tocrawl.pop()  # changed page to url - clearer name
        if url not in crawled:
            content = get_page(url)
            outlinks = get_all_links(content)
            corpus.add_page(url, content, outlinks)
            tocrawl.update(outlinks)
            crawled.append(url)
    corpus.finish_crawl()
    return corpus
import json
from bs4 import BeautifulSoup

def crawl_web(seed, max_depth, crawled):  # returns index, graph of outlinks
    tocrawl = set(seed)  # seed is expected to be an iterable of urls
    depth = 0
    wcorpus = WebCorpus()
    while tocrawl:
        url = tocrawl.pop()
        if url not in crawled:
            try:
                content = real_get_page(url)
                if content:
                    words = get_all_words(content)
                    if 'joe' in words or 'Joe' in words or 'joseph' in words or 'Joseph' in words:
                        print(url)
                    outlinks = get_all_links(content)
                    for outlink in outlinks:
                        wcorpus.add_link(url, outlink)
                    if depth >= max_depth:
                        # depth limit reached: checkpoint the crawl state and stop
                        tocrawl.update(set(outlinks))
                        write_tocrawl(list(tocrawl))
                        write_crawled(crawled)
                        return wcorpus
                    soup = BeautifulSoup(content, "html.parser")
                    try:
                        title = soup.title.string
                    except AttributeError:
                        title = ''
                    try:
                        desc = soup.findAll(attrs={"name": "description"})[0]['content']
                    except (IndexError, KeyError):
                        desc = ''
                    for word in words:
                        wcorpus.add_word_occurrence(url, word, title, desc)
                    tocrawl.update(outlinks)
                    # checkpoint the frontier after every page
                    with open('tocrawl_json.json', 'w') as outfile:
                        json.dump({'tocrawl': list(tocrawl)}, outfile)
                    crawled.append(url)
                    depth += 1  # note: counts pages crawled, not link depth
                else:
                    crawled.append(url)  # mark unfetchable pages as crawled too
            except Exception as e:
                print("ERROR " + str(e))
    return wcorpus
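# The deep-crawl variant above checkpoints its state through write_tocrawl and
# write_crawled, which are defined elsewhere. A minimal sketch, mirroring the
# inline json dump it already performs; the 'crawled_json.json' filename is an
# assumption:
import json

def write_tocrawl(urls):
    # persist the crawl frontier so an interrupted crawl can resume
    with open('tocrawl_json.json', 'w') as outfile:
        json.dump({'tocrawl': urls}, outfile)

def write_crawled(urls):
    # persist the list of pages already visited
    with open('crawled_json.json', 'w') as outfile:  # assumed filename
        json.dump({'crawled': urls}, outfile)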
def crawl_web(seed):  # returns index, graph of outlinks
    tocrawl = set([seed])
    crawled = []
    wcorpus = WebCorpus()
    while tocrawl:
        url = tocrawl.pop()  # changed page to url - clearer name
        if url not in crawled:
            content = get_page(url)
            outlinks = get_all_links(content)
            for outlink in outlinks:
                wcorpus.add_link(url, outlink)
            for word in content.split():
                wcorpus.add_word_occurrence(url, word)
            tocrawl.update(outlinks)
            crawled.append(url)
    return wcorpus
def crawlWeb(seed):
    toCrawl = set([seed])  # start with a seed page
    crawled = []  # keep a record of sites crawled to prevent repeat visits
    wcorpus = WebCorpus()
    while toCrawl:
        url = toCrawl.pop()
        if url not in crawled:  # check whether already crawled
            content = get_page(url)  # read in all of the page's html text
            outlinks = getAllLinks(content)  # store outlinks for building the graph
            for outlink in outlinks:
                wcorpus.add_link(url, outlink)
            for word in content.split():
                wcorpus.add_word_occurrence(url, word)
            toCrawl.update(outlinks)  # add outlinks to the toCrawl set if not crawled already
            crawled.append(url)  # store the page that we popped in crawled
    return wcorpus
def crawl_web(seed):  # returns webcorpus (includes index, graph)
    tocrawl = set([seed])
    crawled = []
    corpus = WebCorpus()
    while tocrawl:
        url = tocrawl.pop()
        if url not in crawled:
            content = get_page(url)
            add_page_to_index(corpus, url, content)
            outlinks = get_all_links(content)
            for outlink in outlinks:
                corpus.add_link(url, outlink)
            tocrawl.update(outlinks)
            crawled.append(url)
    return corpus
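# All of the crawl_web variants above rely on two helpers defined elsewhere.
# A minimal sketch, assuming get_page returns a page's html as a string
# (empty on failure) and get_all_links returns the href targets found in it;
# the _LinkParser class is our own scaffolding:
import urllib.request
from html.parser import HTMLParser

def get_page(url):
    # fetch url and return its html, or '' if the request fails
    try:
        with urllib.request.urlopen(url) as response:
            return response.read().decode('utf-8', errors='replace')
    except Exception:
        return ''

class _LinkParser(HTMLParser):
    # collects the href attribute of every <a> tag fed to it
    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.links.append(value)

def get_all_links(content):
    # return every link target in content, in document order
    parser = _LinkParser()
    parser.feed(content)
    return parser.links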
def test_engine():
    print("Testing...")
    content = """This is a sample <a href="http://www.example.com">webpage</a>
    with <a href="http://www.go.to">two links</a> that lead nowhere special.
    """
    outlinks = ["http://www.example.com", "http://www.go.to"]
    corpus = WebCorpus()
    assert corpus.lookup("anything") is None
    for link in outlinks:
        corpus.add_link("http://www.test.info", link)
    assert corpus._graph["http://www.test.info"] == outlinks
    corpus.add_word_occurrence("http://www.test.info", "sample")
    assert corpus._index["sample"] == ["http://www.test.info"]
    print("Finished tests.")
###
### You should define the WebCorpus class in the file webcorpus.py
###
from webcorpus import WebCorpus

print("Testing webcorpus...")
wc1 = WebCorpus()
assert isinstance(wc1.index, dict)
assert isinstance(wc1.graph, dict)
print("Finished tests.")
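# webcorpus.py -- a minimal sketch of the WebCorpus class the snippets above
# assume, covering only what the tests exercise: writable index and graph
# dicts, add_link, add_word_occurrence, lookup, and the underscored aliases
# read by test_engine. Variants that call add_page, finish_crawl, add_friend,
# or get_results/get_count would need further methods not sketched here.

class WebCorpus:
    def __init__(self):
        self.index = {}  # word -> list of urls where it occurs
        self.graph = {}  # url -> list of outlinks

    # test_engine reads the underscored names, so alias them
    @property
    def _index(self):
        return self.index

    @property
    def _graph(self):
        return self.graph

    def add_link(self, url, outlink):
        # record one url -> outlink edge in the link graph
        self.graph.setdefault(url, []).append(outlink)

    def add_word_occurrence(self, url, word, title='', desc=''):
        # record that word occurs on url; title/desc are accepted (and
        # ignored here) because the deep-crawl variant passes them
        urls = self.index.setdefault(word, [])
        if url not in urls:
            urls.append(url)

    def lookup(self, word):
        # return the list of urls containing word, or None if unseen
        return self.index.get(word)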
def select_attribute(NN, JJ):
    # NN is a noun, JJ an adjective; attributes and the *_patterns lists
    # are assumed to be defined at module level
    NN = NN.lower()
    JJ = JJ.lower()
    search = WebCorpus()
    # Get counts for the attribute dimensions for both the noun and adjective
    nn_vector = {att: 0 for att in attributes}
    jj_vector = {att: 0 for att in attributes}
    for ATTR in attributes:
        # Collect noun numbers
        for p in NN_nn_jj_attr_patterns:
            results = search.get_results(p % (NN, JJ, ATTR))
            nn_vector[ATTR] += search.get_count(results)
        for p in NN_attr_jj_nn_patterns:
            results = search.get_results(p % (ATTR, JJ, NN))
            nn_vector[ATTR] += search.get_count(results)
        # Collect adjective numbers
        for p in JJ_jj_attr_patterns:
            results = search.get_results(p % (JJ, ATTR))
            jj_vector[ATTR] += search.get_count(results)
        for p in JJ_attr_nn_jj_patterns:
            results = search.get_results(p % (ATTR, NN, JJ))
            jj_vector[ATTR] += search.get_count(results)
        for p in JJ_nn_attr_jj_patterns:
            results = search.get_results(p % (NN, ATTR, JJ))
            jj_vector[ATTR] += search.get_count(results)
    # Combine the two vectors elementwise and pick the strongest attribute
    sel_vector = {}
    for k, v in nn_vector.items():
        sel_vector[k] = v * jj_vector[k]
    attribute = ""
    best = 0
    for k, v in sel_vector.items():
        if v > best:  # fixed: the running maximum was never updated
            best = v
            attribute = k
    return attribute
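# The selection step in select_attribute multiplies the noun and adjective
# evidence vectors elementwise and keeps the attribute with the largest
# product. A tiny self-contained illustration with made-up counts:
nn_vector = {"temperature": 12, "size": 3, "color": 0}  # hypothetical counts
jj_vector = {"temperature": 8, "size": 1, "color": 5}
sel_vector = {k: v * jj_vector[k] for k, v in nn_vector.items()}
assert max(sel_vector, key=sel_vector.get) == "temperature"  # 96 beats 3 and 0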