Code example #1
File: namechar_crawler.py  Project: heevery/webid
	def scrape(self):
		self.driver.get('http://www.erumy.com/nameAnalyze/eDefault.aspx')
		print self.driver.title
		
		WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.ID, 'IFRAMENAMERESULT')))
		self.driver.switch_to.frame('IFRAMENAMERESULT')
		# WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.ID, 'aspnetForm')))
		
		with open('kornamechar.txt','r') as f:
			lines = f.readlines()
			namechars = [line.strip() for line in lines]

		self.sqldb = StoreNameChar('namechar')
		for char in namechars:
			try:
				namechar = self.scrape_namechar_stat(char)
				print namechar.db_record_form()
				self.sqldb.store(namechar)
				self.sqldb.commit()
			except TimeoutException:
				# print the input character; namechar is unset when the lookup times out
				print "%s is not scraped" % char
				continue
		
		self.sqldb.close()
		self.driver.switch_to.default_content()
Code example #2
File: namechar_crawler.py  Project: heevery/webid
import re

from bs4 import BeautifulSoup as BS
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# StoreNameChar and NameChar are defined elsewhere in this project.


class NameCharScraper(object):
	def __init__(self):
		# Headless PhantomJS browser session.
		self.driver = webdriver.PhantomJS()
	
	def close(self):
		self.driver.close()
		self.driver = None
	
	def scrape(self):
		self.driver.get('http://www.erumy.com/nameAnalyze/eDefault.aspx')
		print self.driver.title
		
		# The analysis result is rendered inside an iframe, so switch into it before scraping.
		WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.ID, 'IFRAMENAMERESULT')))
		self.driver.switch_to.frame('IFRAMENAMERESULT')
		# WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.ID, 'aspnetForm')))
		
		# kornamechar.txt lists one Korean name character per line.
		with open('kornamechar.txt', 'r') as f:
			lines = f.readlines()
			namechars = [line.strip() for line in lines]

		self.sqldb = StoreNameChar('namechar')
		for char in namechars:
			try:
				namechar = self.scrape_namechar_stat(char)
				print namechar.db_record_form()
				self.sqldb.store(namechar)
				self.sqldb.commit()
			except TimeoutException:
				# print the input character; namechar is unset when the lookup times out
				print "%s is not scraped" % char
				continue
		
		self.sqldb.close()
		self.driver.switch_to.default_content()
	
	def scrape_namechar_stat(self, korchar):
		# Fill the search box with a dummy name ('김' + the character twice) via
		# JavaScript, submit the form, and wait for the result block to appear.
		# A TimeoutException from either wait propagates to the caller.
		searchbox_id = 'ctl00_ContentPlaceHolder1_idSearchBox'
		searchbox = WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.ID, searchbox_id)))
		name = ''.join(['김', korchar, korchar])
		script = "document.getElementById('%s').value='%s';" % (searchbox_id, name.decode('utf-8'))
		self.driver.execute_script(script)
		searchbox.submit()

		WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.ID, 'name_info')))
		
		# Parse the result page with BeautifulSoup; the fourth table inside #name_info
		# holds the per-character usage statistics as plain text.
		soup = BS(self.driver.page_source, 'html.parser')
		name_info = soup.find(id="name_info")
		char_info_text = name_info.find_all('table')[3].find('tbody').find('tr').find('td').text

		infos = char_info_text.encode('utf-8').strip().replace('%','').split('\n')
		infos = [line.strip() for line in infos]
		
		# Ranks appear as "N번째" (N-th) and portions as percentages in parentheses.
		m_rank = re.findall('(?<=\s)[0-9]*(?=번째)', infos[0])[0]
		m_portion = re.findall('(?<=\()[0-9.]*(?=\))', infos[0])[0]
		[m1_rank, m2_rank] = re.findall('(?<=\s)[0-9]*(?=번째)', infos[1])
		[m1_portion, m2_portion] = re.findall('(?<=\()[0-9.]*(?=\))', infos[1])

		f_rank = re.findall('(?<=\s)[0-9]*(?=번째)', infos[3])[0]
		f_portion = re.findall('(?<=\()[0-9.]*(?=\))', infos[3])[0]
		[f1_rank, f2_rank] = re.findall('(?<=\s)[0-9]*(?=번째)', infos[4])
		[f1_portion, f2_portion] = re.findall('(?<=\()[0-9.]*(?=\))', infos[4])

		namechar = NameChar(korchar, m_rank, m_portion, m1_rank, m1_portion, m2_rank, m2_portion, f_rank, f_portion, f1_rank, f1_portion, f2_rank, f2_portion)
		return namechar
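
A minimal usage sketch, not taken from the project: assuming the imports above and a working PhantomJS install, the scraper would be driven roughly like this.

# Hypothetical driver script, not part of heevery/webid.
if __name__ == '__main__':
	scraper = NameCharScraper()
	try:
		scraper.scrape()   # crawl every character in kornamechar.txt and store the stats
	finally:
		scraper.close()    # release the PhantomJS driver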