Ejemplo n.º 1
0
	def scrape_names(self, names):
		"""Scrape and store the statistics of every name in ``names``.

		Loads the erumy name-analysis page, switches into the
		``IFRAMENAMERESULT`` frame, then looks each name up with
		``scrape_name_stat``. Every result is stored and committed through a
		``StoreNameStat('namestat')`` instance kept on ``self.sqldb``.

		A name is reported on stdout and skipped when ``scrape_name_stat``
		returns ``None`` or when a ``TimeoutException`` is raised while it is
		being processed.

		Args:
			names: iterable of names to look up.

		Raises:
			Whatever the initial page load / frame wait raises (the frame is
			waited for up to 10 seconds), and any non-timeout error raised
			while processing a name.
		"""
		self.driver.get('http://www.erumy.com/nameAnalyze/eDefault.aspx')
		print(self.driver.title)

		WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.ID, 'IFRAMENAMERESULT')))
		self.driver.switch_to.frame('IFRAMENAMERESULT')

		self.sqldb = StoreNameStat('namestat')
		try:
			for name in names:
				try:
					namestat = self.scrape_name_stat(name)
					if namestat is None:
						print("no info for %s" % name)
						continue
					print(namestat)
					# Commit per name so earlier results survive a later failure.
					self.sqldb.store(namestat)
					self.sqldb.commit()
				except TimeoutException:
					print("%s is not scraped" % name)
					continue
		finally:
			# Release the store even when an unexpected (non-timeout) error
			# escapes the loop, e.g. a parsing failure in scrape_name_stat.
			self.sqldb.close()

		self.driver.switch_to.default_content()
Ejemplo n.º 2
0
class NameStatScraper(object):
	"""Scrape per-name statistics from erumy.com through a PhantomJS webdriver.

	NOTE: this is Python 2 code; names are handled as UTF-8 byte strings and
	decoded only when handed to the browser.
	"""

	def __init__(self):
		"""Start a PhantomJS webdriver and keep it on ``self.driver``."""
		self.driver = webdriver.PhantomJS()

	def close(self):
		"""Quit the webdriver and drop the reference to it."""
		self.driver.quit()
		self.driver = None

	def reopen(self):
		"""Shut the current webdriver down and start a fresh one.

		Waits 5 seconds between shutting the old driver down and creating
		the new one.
		"""
		try:
			self.driver.close()
		except URLError:
			print("the window does not respond")
		finally:
			# close() only closes the current window; quit() is what ends
			# the session, so it has to run on the success path as well or
			# every reopen leaves the old browser behind.
			self.driver.quit()

		time.sleep(5)
		self.driver = webdriver.PhantomJS()
		print("reopen new webdriver")

	def scrape(self):
		"""Scrape statistics for every doubled name character in ``webid.db``.

		Reads ``korchar`` from the ``namechar`` table (rows where
		``male_rank + female_rank > 0``), builds one candidate name per
		character by repeating it twice, and feeds the names to
		``scrape_names`` in batches of 1000. The webdriver is reopened after
		every batch.
		"""
		conn = sqlite3.connect("webid.db")
		try:
			cursor = conn.cursor()
			cursor.execute('select korchar from namechar where male_rank + female_rank > 0')
			namechars = [row[0].encode('utf-8') for row in cursor.fetchall()]
		finally:
			# Using the connection as a context manager would not close it,
			# so close it explicitly once the rows have been read.
			conn.close()

		names = [char + char for char in namechars]

		batch_size = 1000
		num_batch = int(math.ceil(float(len(names)) / batch_size))
		print("%d %d" % (len(names), num_batch))
		for batch_i in range(num_batch):
			start_idx = batch_i * batch_size
			# Slicing past the end is safe, so the last batch is just shorter.
			self.scrape_names(names[start_idx:start_idx + batch_size])

			self.reopen()

	def scrape_names(self, names):
		"""Scrape and store the statistics of every name in ``names``.

		Loads the erumy name-analysis page, switches into the
		``IFRAMENAMERESULT`` frame, then looks each name up with
		``scrape_name_stat``. Every result is stored and committed through a
		``StoreNameStat('namestat')`` instance kept on ``self.sqldb``.

		A name is reported on stdout and skipped when ``scrape_name_stat``
		returns ``None`` or when a ``TimeoutException`` is raised while it is
		being processed.

		Args:
			names: iterable of names to look up.

		Raises:
			Whatever the initial page load / frame wait raises (the frame is
			waited for up to 10 seconds), and any non-timeout error raised
			while processing a name.
		"""
		self.driver.get('http://www.erumy.com/nameAnalyze/eDefault.aspx')
		print(self.driver.title)

		WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.ID, 'IFRAMENAMERESULT')))
		self.driver.switch_to.frame('IFRAMENAMERESULT')

		self.sqldb = StoreNameStat('namestat')
		try:
			for name in names:
				try:
					namestat = self.scrape_name_stat(name)
					if namestat is None:
						print("no info for %s" % name)
						continue
					print(namestat)
					# Commit per name so earlier results survive a later failure.
					self.sqldb.store(namestat)
					self.sqldb.commit()
				except TimeoutException:
					print("%s is not scraped" % name)
					continue
		finally:
			# Release the store even when an unexpected (non-timeout) error
			# escapes the loop, e.g. a parsing failure in scrape_name_stat.
			self.sqldb.close()

		self.driver.switch_to.default_content()

	def scrape_name_stat(self, name):
		"""Look up one name on the already-loaded page and parse its statistics.

		The name is prefixed with the surname '김', typed into the search
		box and submitted; the resulting ``name_info`` element is then
		parsed out of the page source.

		Args:
			name: given name as a UTF-8 encoded byte string.

		Returns:
			A ``NameStat(name, rank, proportion, f_p, m_freq, f_freq,
			freq_year, freq_region, freq_sur)``, or ``None`` when the gender
			reported by the page is neither '남' nor '여'.

		Raises:
			TimeoutException: propagated from the waits when the search box
				or the ``name_info`` element does not appear within 10 seconds.
			IndexError / ValueError: when the page text does not contain the
				expected patterns.
		"""
		searchbox_id = 'ctl00_ContentPlaceHolder1_idSearchBox'
		searchbox = WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.ID, searchbox_id)))
		fullname = ''.join(['김', name])
		# Hand the values over as script arguments instead of splicing them
		# into the JavaScript source, so a quote or backslash in the name
		# cannot break (or alter) the script.
		self.driver.execute_script(
			"document.getElementById(arguments[0]).value = arguments[1];",
			searchbox_id, fullname.decode('utf-8'))
		searchbox.submit()

		WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.ID, 'name_info')))

		soup = BS(self.driver.page_source, 'html.parser')
		name_info = soup.find(id="name_info")
		# The statistics live in the rows of the second table of name_info.
		namestat_trs = name_info.find_all('table')[1].find('tbody').find_all('tr')

		preference = namestat_trs[0].find('td').text.encode('utf-8').strip()
		gender_likely = namestat_trs[2].find('td').text.encode('utf-8').strip()
		birth_info = namestat_trs[3].find('td').text.encode('utf-8').strip()

		gender_likely_lines = gender_likely.split('\n')
		gender_likely_line2 = gender_likely_lines[2].strip()
		gender_likely_line3 = gender_likely_lines[3].strip()

		prob = re.findall(r'(?<=\s)[0-9.]*(?=%)', gender_likely_line2)[0]
		gender = re.findall(r'(?<=\s)[가-힣]*(?=성적인)', gender_likely_line2)[0]
		m_freq, f_freq = re.findall(r'(?<=\s)[0-9,]*(?=번)', gender_likely_line3)
		# prob is the percentage for the reported gender; f_p is that value
		# as a fraction for '여' and its complement for '남'
		# (presumably the probability of the name being female).
		if gender == '남':
			f_p = 1 - float(prob)/100
		elif gender == '여':
			f_p = float(prob)/100
		else:
			return None
		m_freq = int(m_freq.replace(',', ''))
		f_freq = int(f_freq.replace(',', ''))

		# NOTE(review): the rank pattern does not admit ',' although the
		# result is stripped of commas afterwards -- confirm whether ranks
		# with thousands separators can appear on the page.
		rank = safe_list_get(re.findall(r'(?<=\s)[0-9]*(?=번째)', preference), 0, '-1')
		rank = int(rank.replace(',', ''))
		proportion = float(safe_list_get(re.findall(r'(?<=\()[0-9.]*(?=%)', preference), 0, 0))

		freq_year = int(re.findall(r'(?<=\s)[0-9]*(?=년도에)', birth_info)[0])
		freq_region = re.findall(r'(?<=\s)[가-힣]*(?=\s출생인)', birth_info)[0]
		freq_sur = re.findall(r'(?<=\s)[가-힣]*(?=씨입니다)', preference)[0]

		return NameStat(name, rank, proportion, f_p, m_freq, f_freq, freq_year, freq_region, freq_sur)