コード例 #1
0
ファイル: nadaScraper.py プロジェクト: joed4no/underValueCars
def main():
	"""Crawl the Make -> Year -> Model listing pages and pass each trim page to getTrimInfo.

	Opens the "Cars" index, follows every link inside the 'makelist' element,
	then every year link inside 'rtn_content_modelyear_yearlist' whose href does
	not contain 'Used'. If the year page has a 'rtn_content_models' element,
	each of its table rows is parsed with getModelInfo and the resulting trim
	link is passed to getTrimInfo; otherwise the year page itself is passed to
	getTrimInfo. Progress is printed along the way. Returns nothing.
	"""
	# loops through each vehicle Make
	# NOTE(review): this function reads two differently spelled module-level
	# names, base_url (here) and baseurl (everywhere below). They are left as
	# found -- confirm that both are defined and how they differ.
	make_soup = tools.openWebPage(base_url + "Cars")
	make_list = make_soup.find(id='makelist')
	for make in make_list.find_all('a'):
		make_href = make.get('href')
		if not make_href:
			# an anchor without a target cannot be followed
			continue
		print(make.string)

		# loops through each Make-Year
		tools.randomSleep()
		year_soup = tools.openWebPage(baseurl + make_href)
		year_list = year_soup.find(id='rtn_content_modelyear_yearlist')
		for year in year_list.find_all('a'):
			year_href = year.get('href')
			# Ignores Make-Year's that specify Used (the same info is captured in the "new" page)
			if not year_href or 'Used' in year_href:
				continue
			print(year.string)

			# loops through each Make-Year-Model
			tools.randomSleep()
			year_url = baseurl + year_href
			model_soup = tools.openWebPage(year_url)

			# checks to see if there are specific Model pages
			# older years do not have Model pages and just display Model-Trim info right away
			model_list = model_soup.find(id='rtn_content_models')
			if model_list:
				for model in model_list.find_all("tr"):
					# captures the Make-Year-Model info as well as Model description
					(model_info_string, trim_href, model_info_desc, img_string) = getModelInfo(model)
					print("Got " + model_info_string)

					getTrimInfo(baseurl + trim_href, year.string, make.string, model_info_string, img_string)
			else:
				# No Model table: the trims are listed on the year page itself, so
				# that page's URL is handed over. (trim_href is only ever assigned
				# inside the Model loop above and must not be used here.)
				getTrimInfo(year_url, year.string, make.string)
コード例 #2
0
ファイル: nadaScraper.py プロジェクト: joed4no/underValueCars
def _findByIdOrClass(node, marker):
	# Returns the first descendant of node whose id equals marker or, failing
	# that, whose CSS classes include marker; None when neither exists.
	found = node.find(id=marker)
	if found is None:
		found = node.find(class_=marker)
	return found


def getTrimInfo(trim_url, year, make, model_string='', img_string=''):
	"""Open a trim listing page and print one record per trim found on it.

	Parameters:
		trim_url     -- address of the page to open with tools.openWebPage
		year, make   -- strings printed with every record and used to build the
		                prefix that is removed from link texts
		model_string -- model text; part of the removed prefix and the default
		                model printed for rows without a model-name cell
		img_string   -- printed unchanged as the last field of every record

	Each table row inside the 'rtn_content_trims' element is handled one of two
	ways. A row containing a 'rtn_content_trims_modelname' element has that
	element's text split on "-" into model and sub-model, and one record is
	printed per link in the row. Any other row prints one record for its first
	link, with "<year> <make> <model_string>" removed from the link text.
	Returns nothing; does nothing when the page has no 'rtn_content_trims'.
	"""
	tools.randomSleep()
	trim_soup = tools.openWebPage(trim_url)
	trim_list = trim_soup.find(id='rtn_content_trims')
	if trim_list is None:
		# page carries no trim table -- nothing to record
		return

	record_format = "%s %s %s %s %s"
	# Seeded so that rows without a model-name cell have a model to report;
	# a preceding model-name row still overrides it for the rows that follow.
	model = model_string

	for trim in trim_list.find_all("tr"):
		# NOTE(review): the original passed this marker as a tag name, which
		# never matches. Whether it is an id or a class is not visible from
		# here, so both are tried -- confirm against the page markup.
		model_name = _findByIdOrClass(trim, "rtn_content_trims_modelname")
		if model_name is not None:
			full_model_string = model_name.get_text().strip()
			name_parts = full_model_string.split("-")
			model = name_parts[0]
			sub_model = " ".join(name_parts[1:])

			# links are <a> tags; "href" is an attribute, not a tag name
			for types in trim.find_all("a"):
				trim_string = sub_model + types.get_text()

				# This is what goes in the database
				print(record_format % (year, make, model, trim_string, img_string))
		else:
			link = trim.find("a")
			if link is None:
				# row without a link (e.g. a header or spacer row)
				continue
			remove_string = year + " " + make + " " + model_string
			full_model_string = link.get_text().strip()
			trim_string = full_model_string.replace(remove_string, '')

			# This is what goes in the database
			print(record_format % (year, make, model, trim_string, img_string))