Python crawl_subject Examples

Programming Language: Python

Namespace/Package Name: trunk

Method/Function: crawl_subject

Examples at hotexamples.com: 2

Python crawl_subject - 2 examples found. These are the top-rated real-world Python examples of trunk.crawl_subject extracted from open source projects. You can rate examples to help us improve the quality of examples.

Example #1

Show file

File: checklog.py Project: Lufay/crawl_seed

def check_dir(dir_name, logfile_name):
	"""Inspect index.log inside dir_name and resume the crawl if it did not finish.

	The function changes into dir_name, reads the head and tail of
	index.log through trunk.GetFileLine and dispatches on the last line:

	* 'Success': report it and, when the global ``clf`` is set and does
	  not already list the first line, append a record to it.
	* 'Refresh this page...': resume the hash download with the lines
	  that follow it.
	* a known download failure: retry the hash download.
	* 'Error: open page ...' / 'Error: not find dowload path in...':
	  re-run trunk.crawl_subject for the first line of the log.
	* anything else: ask the user interactively.

	Args:
		dir_name: directory that holds index.log.
		logfile_name: path of a file that receives the output of the
			recovery helpers; when falsy, sys.stdout is used instead.

	Returns:
		True when the log ended with 'Success' or a plain retry was
		started; otherwise the result of hash_download / check_res, or
		whether the user answered 'y' in the interactive branch.

	Exits the process with status 1 when index.log cannot be opened.
	"""
	# Remember where we started: chdir('..') is only right for a
	# single-level relative dir_name, and it is skipped if anything raises.
	start_dir = os.getcwd()
	os.chdir(dir_name)
	f = None
	logfile = sys.stdout
	try:
		try:
			f = open('index.log')
		except IOError:
			print("can't open index.log in %s\n" % dir_name)
			sys.exit(1)
		ret = True
		# presumably GetFileLine(f, 200) indexes the file so its first/last
		# lines can be fetched cheaply -- confirm against trunk.
		gl = trunk.GetFileLine(f, 200)
		firstline, secondline = gl.get_first(2)
		lastlines = gl.get_last(10)
		lastline = lastlines[0]
		if lastline == 'Success':
			print("%s success" % dir_name)
			if clf and not clf.has_download(firstline):
				clf.write([firstline, secondline, dir_name])
				clf.write("\n")
		else:
			print("last line of index.log in %s is:" % dir_name)
			print(repr(lastline))
			if logfile_name:
				logfile = open(logfile_name, 'w+')
			if lastline.startswith('Refresh this page'):
				url_head = url_head_pattern.search(lastline).group()
				ret = hash_download(url_head, lastlines[1:], logfile)
			elif lastline in ('Checked fail: Refresh this page',
					"Download retry Failed") or \
					lastline.startswith('No such file'):
				retry_hash_download(gl, logfile=logfile)
			elif lastline.startswith('Error: open page '):
				ret = check_res(trunk.crawl_subject(firstline, logfile=logfile))
			elif lastline.startswith('Error: not find dowload path in'):
				ret = check_res(trunk.crawl_subject(firstline, 0, logfile))
			else:
				if dir_name in failed_dict:
					print('%s has checked failed, info:\n%s' % (dir_name, failed_dict[dir_name]))
				choice = raw_input('expect lastline or retry?[y/r/n]')
				if choice.startswith('r'):
					retry_hash_download(gl, logfile=logfile)
				# Only an explicit 'y' counts as success; 'r' and 'n' report False.
				ret = choice.startswith('y')
		return ret
	finally:
		# Close what this call opened (never sys.stdout) and restore the cwd,
		# including when a helper raised.
		if logfile is not sys.stdout:
			logfile.close()
		if f is not None:
			f.close()
		os.chdir(start_dir)

Example #2

Show file

File: findlost.py Project: Lufay/crawl_seed

# Probe every candidate subject page in [beginID, endID); for each page that
# opens, create a timestamp-named directory, crawl the subject into it and
# record the outcome in clf.
for page_id in xrange(beginID, endID):
	sub_url = 'htm_data/2/1602/1%d.html' % page_id
	url = trunk.domain + sub_url
	content = trunk.open_page(url, 4)
	if not content:
		print('%s open failed\n' % sub_url)
		continue
	soup = BeautifulSoup(content, from_encoding='gbk')
	title = unicode(soup.title.string)
	# find() returns -1 when title_end is absent; slicing with -1 would
	# silently drop the last character, so only cut on a real match.
	title_end_pos = title.find(title_end)
	if title_end_pos != -1:
		title = title[:title_end_pos]
	encode_title = str(title.encode('gb18030'))

	# Each subject is crawled inside its own directory named after the
	# current timestamp.
	now = str(time.time())
	os.mkdir(now)
	os.chdir(now)
	try:
		# 'with' closes index.log even when crawl_subject raises.
		with open('index.log', 'w+') as logfile:
			logfile.write("%s\n" % sub_url)
			logfile.write("%s\n" % encode_title)
			logfile.write("\n")
			res_tuple = trunk.crawl_subject(sub_url, logfile=logfile)
			# res_tuple[0] is the success flag; on failure res_tuple[1] is
			# recorded in front of the usual fields.
			if res_tuple[0]:
				clf.write([sub_url, encode_title, now])
			else:
				clf.write([res_tuple[1], sub_url, encode_title, now])
			clf.write("\n")
	finally:
		# Always step back out so the next iteration starts from the parent.
		os.chdir('..')
	clf.flush()
clf.close()