def main(url):
    """Spider main loop: crawl breadth-first starting from `url`.

    Pops Documents off the global request queue RQ, downloads each one,
    records it in the global database DB, and enqueues newly discovered
    URLs.  Pages outside spsu.edu are fetched but not spidered further.
    The database is checkpointed every SAVE_EVERY documents.

    A KeyboardInterrupt saves the queue and exits the process; any other
    exception is logged with a traceback and the function returns.
    """
    global DB
    global RQ
    RQ.push(Document(url))
    try:
        count = 0
        while not RQ.empty():
            doc = RQ.pop()
            url = doc.url
            print("Url '%s' dequeued." % doc.url)

            # Don't fetch again if already in the database.
            if doc.url in DB:
                continue
            DB[url] = doc

            print("Downloading...")
            doc.download()

            # If we just downloaded an external domain, we don't
            # continue to spider it.
            if not url.isOnDomain('spsu.edu'):
                continue
            if doc.isMissing():
                continue

            urls = doc.getUrls()
            print("%d urls parsed from page" % len(urls))
            for u in urls:
                if u not in DB:
                    d = Document(u)
                    d.linksIn.append(doc)
                    RQ.push(d, 1)  # TODO: priority heuristic
                else:
                    d = DB[u]
                    d.linksIn.append(doc)
                doc.linksOut.append(d)

            count += 1
            # Periodic checkpoint.  The counter is NOT reset: the modulo
            # test already fires every SAVE_EVERY documents (the old
            # `count = 1` reset skewed the interval by one).
            if count % SAVE_EVERY == 0:
                save_database()
    except KeyboardInterrupt:
        # Announce and persist the queue BEFORE exiting — the original
        # called sys.exit() first, which made both lines unreachable and
        # silently dropped the queue on Ctrl-C.
        print("Keyboard Interrupt, spider terminating.")
        save_queue()  # XXX This should be fixed.
        sys.exit()
    except Exception as e:
        # Top-level boundary: log the failure and fall through (return).
        import traceback
        print('\n---------------------')
        print("Exception occurred in mainloop")
        print('Exception: %s' % e)
        print('- - - - - - - - - - -')
        traceback.print_tb(sys.exc_info()[2])
        print("\n")