Esempio n. 1
0
def get_context(url, matchtext, before, after):
    """Return the text surrounding the first occurrence of ``matchtext``.

    The page at ``url`` is read through ``get_cached_url`` and converted with
    ``html_to_text``.  The window starts ``before`` characters ahead of the
    start of the match and ends ``after`` characters past the start of the
    match; it is then passed through ``trim_to_words``.

    Returns an empty string when ``matchtext`` does not occur in the text.
    """
    html = get_cached_url(url).read()
    text = html_to_text(html)
    pos = text.find(matchtext)
    if pos < 0:
        # find() signals "not found" with -1.  Slicing around -1 would
        # silently return the start of the page as if it were context.
        return ""
    # Slicing already clamps an end index beyond len(text), so only the
    # lower bound needs explicit protection against going negative.
    window = text[max(0, pos - before):pos + after]
    return trim_to_words(window)
Esempio n. 2
0
def get_boss(query, start=0, count=10):
    """Return the ``result`` elements for ``query``, or None on an offset mismatch.

    The request URL comes from ``get_boss_url``; the response is fetched via
    ``uc.get_cached_url("boss", ..., pause=True)`` and parsed with ``XML``.
    None is returned when the ``start`` attribute reported on the
    ``resultset_web`` element differs from the requested ``start``.
    """
    request_url = get_boss_url(query, start, count)
    dom = XML(uc.get_cached_url("boss", request_url, pause=True))
    reported_start = int(dom.find("resultset_web").attr("start"))
    if reported_start != start:
        # The response is for a different offset than the one asked for.
        return None
    return dom.findAll("result")
Esempio n. 3
0
def get_boss(query,start=0,count=10):
	"""Return the ``result`` elements for ``query``, or None on an offset mismatch.

	The request URL comes from ``get_boss_url``; the response is fetched via
	``uc.get_cached_url("boss", ...)`` and parsed with ``XML``.  None is
	returned when the ``start`` attribute reported on the ``resultset_web``
	element differs from the requested ``start``.
	"""
	request_url = get_boss_url(query, start, count)
	dom = XML(uc.get_cached_url("boss", request_url))
	reported_start = int(dom.find("resultset_web").attr("start"))
	if reported_start != start:
		# The response is for a different offset than the one asked for.
		return None
	return dom.findAll("result")
Esempio n. 4
0
	def run(self):
		global totaldownloaded
		global totalfiles
		global urls
		global timeouts 
		print "thread running"
		while len(urls) > 0:
			url = urls.pop()
			if url.endswith("pdf"): continue
			try:
				content = uc.get_cached_url("pages",url,400000,2).read()			
				totaldownloaded += len(content)
				totalfiles += 1
				if totalfiles % 10 == 0:
					print "size:",len(content),"avg:",(totaldownloaded/totalfiles),"tot:",totaldownloaded,"cnt:",totalfiles,"tmo:",timeouts,"url:",url[:50]
			except:	
				timeouts += 1	
Esempio n. 5
0
 def run(self):
     global totaldownloaded
     global totalfiles
     global urls
     global timeouts
     print "thread running"
     while len(urls) > 0:
         url = urls.pop()
         if url.endswith("pdf"): continue
         try:
             content = uc.get_cached_url("pages", url, 400000, 2).read()
             totaldownloaded += len(content)
             totalfiles += 1
             if totalfiles % 10 == 0:
                 print "size:", len(content), "avg:", (
                     totaldownloaded / totalfiles
                 ), "tot:", totaldownloaded, "cnt:", totalfiles, "tmo:", timeouts, "url:", url[:
                                                                                               50]
         except:
             timeouts += 1
Esempio n. 6
0
def get_context(url,matchtext,before,after):
	"""Return the text surrounding the first occurrence of ``matchtext``.

	The page at ``url`` is read through ``get_cached_url`` and converted with
	``html_to_text``.  The window starts ``before`` characters ahead of the
	start of the match and ends ``after`` characters past the start of the
	match; it is then passed through ``trim_to_words``.

	Returns an empty string when ``matchtext`` does not occur in the text.
	"""
	html = get_cached_url(url).read()
	text = html_to_text(html)
	pos = text.find(matchtext)
	if pos < 0:
		# find() signals "not found" with -1.  Slicing around -1 would
		# silently return the start of the page as if it were context.
		return ""
	# Slicing already clamps an end index beyond len(text), so only the
	# lower bound needs explicit protection against going negative.
	window = text[max(0, pos - before):pos + after]
	return trim_to_words(window)
Esempio n. 7
0
def boss_counts_for_pattern(pattern):
    """Return the ``totalhits`` value reported for ``pattern`` as an int.

    The request is built with ``boss.get_boss_url(pattern, 0, 50)`` and
    fetched through ``uc.get_cached_url("boss", ...)``, so fetching the
    count also retrieves that first page of results.
    """
    request_url = boss.get_boss_url(pattern, 0, 50)
    dom = XML(uc.get_cached_url("boss", request_url))
    web_results = dom.find("resultset_web")
    return int(web_results.attr("totalhits"))
Esempio n. 8
0
def boss_counts_for_pattern(pattern):
	"""Return the ``deephits`` value reported for the quoted ``pattern`` as an int.

	The pattern is wrapped in double quotes before the request is built with
	``boss.get_boss_url(..., 0, 50)`` and fetched through
	``uc.get_cached_url("boss", ...)``, so fetching the count also retrieves
	that first page of results.
	"""
	quoted = '"' + pattern + '"'
	request_url = boss.get_boss_url(quoted, 0, 50)
	dom = XML(uc.get_cached_url("boss", request_url))
	web_results = dom.find("resultset_web")
	return int(web_results.attr("deephits"))