def scrape(self):
    """Scrape name-character statistics for every character listed in
    kornamechar.txt and persist each one through StoreNameChar.

    NOTE(review): this module-level function duplicates
    NameCharScraper.scrape below — presumably a leftover copy; confirm
    which one is actually called before deleting either.
    """
    self.driver.get('http://www.erumy.com/nameAnalyze/eDefault.aspx')
    print(self.driver.title)
    # The result page lives inside an iframe; wait for it before switching.
    WebDriverWait(self.driver, 10).until(
        EC.presence_of_element_located((By.ID, 'IFRAMENAMERESULT')))
    self.driver.switch_to.frame('IFRAMENAMERESULT')
    # WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.ID, 'aspnetForm')))
    with open('kornamechar.txt', 'r') as f:
        namechars = [line.strip() for line in f]
    self.sqldb = StoreNameChar('namechar')
    try:
        for char in namechars:
            try:
                namechar = self.scrape_namechar_stat(char)
                print(namechar.db_record_form())
                self.sqldb.store(namechar)
                # Commit per character so a later timeout loses no work.
                self.sqldb.commit()
            except TimeoutException:
                # BUG FIX: previously formatted `namechar`, which is
                # unbound on a first-iteration timeout (NameError) and
                # stale on later ones; report the character we attempted.
                print("%s is not scraped" % char)
                continue
    finally:
        # BUG FIX: always release the DB handle and leave the iframe,
        # even if an unexpected exception escapes the loop.
        self.sqldb.close()
        self.driver.switch_to.default_content()
class NameCharScraper(object):
    """Scrapes per-character Korean given-name statistics from erumy.com
    (via a headless PhantomJS browser) and stores them with StoreNameChar.
    """

    def __init__(self):
        # PhantomJS runs as a separate ghostdriver process per instance.
        self.driver = webdriver.PhantomJS()

    def close(self):
        # BUG FIX: driver.close() only closes the current window; for
        # PhantomJS the browser process keeps running and leaks.  quit()
        # terminates the driver process as well.
        self.driver.quit()
        self.driver = None

    def scrape(self):
        """Scrape statistics for every character listed in kornamechar.txt
        and persist each result through StoreNameChar, committing per
        character; characters that time out are reported and skipped.
        """
        self.driver.get('http://www.erumy.com/nameAnalyze/eDefault.aspx')
        print(self.driver.title)
        # The result page lives inside an iframe; wait for it before switching.
        WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((By.ID, 'IFRAMENAMERESULT')))
        self.driver.switch_to.frame('IFRAMENAMERESULT')
        # WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.ID, 'aspnetForm')))
        with open('kornamechar.txt', 'r') as f:
            namechars = [line.strip() for line in f]
        self.sqldb = StoreNameChar('namechar')
        try:
            for char in namechars:
                try:
                    namechar = self.scrape_namechar_stat(char)
                    print(namechar.db_record_form())
                    self.sqldb.store(namechar)
                    # Commit per character so a later timeout loses no work.
                    self.sqldb.commit()
                except TimeoutException:
                    # BUG FIX: previously formatted `namechar`, which is
                    # unbound on a first-iteration timeout (NameError) and
                    # stale on later ones; report the character we attempted.
                    print("%s is not scraped" % char)
                    continue
        finally:
            # BUG FIX: always release the DB handle and leave the iframe,
            # even if an unexpected exception escapes the loop.
            self.sqldb.close()
            self.driver.switch_to.default_content()

    def scrape_namechar_stat(self, korchar):
        """Look up *korchar* on the site and return a populated NameChar.

        Raises TimeoutException when the search box or result pane does
        not appear within 10 seconds (propagated to the caller).
        """
        # NOTE: the original wrapped the waits in `try: ... except
        # TimeoutException: raise`, a no-op re-raise; removed.
        searchbox_id = 'ctl00_ContentPlaceHolder1_idSearchBox'
        searchbox = WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((By.ID, searchbox_id)))
        # Query with a dummy full name "Kim<char><char>" so the site
        # returns given-name-position statistics for the character.
        name = ''.join(['김', korchar, korchar])
        script = "document.getElementById('%s').value='%s';" % (searchbox_id, name.decode('utf-8'))
        self.driver.execute_script(script)
        searchbox.submit()
        WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((By.ID, 'name_info')))
        soup = BS(self.driver.page_source, 'html.parser')
        name_info = soup.find(id="name_info")
        # 4th table / first row / first cell holds the frequency summary text.
        char_info_text = name_info.find_all('table')[3].find('tbody').find('tr').find('td').text
        infos = char_info_text.encode('utf-8').strip().replace('%', '').split('\n')
        infos = [line.strip() for line in infos]
        # Lines 0/1 carry male overall and per-position rank ("...번째")
        # and portion ("(NN.N)"); lines 3/4 are the female counterparts.
        # Line 2 is presumably a separator — confirm against the live page.
        m_rank = re.findall('(?<=\s)[0-9]*(?=번째)', infos[0])[0]
        m_portion = re.findall('(?<=\()[0-9.]*(?=\))', infos[0])[0]
        [m1_rank, m2_rank] = re.findall('(?<=\s)[0-9]*(?=번째)', infos[1])
        [m1_portion, m2_portion] = re.findall('(?<=\()[0-9.]*(?=\))', infos[1])
        f_rank = re.findall('(?<=\s)[0-9]*(?=번째)', infos[3])[0]
        f_portion = re.findall('(?<=\()[0-9.]*(?=\))', infos[3])[0]
        [f1_rank, f2_rank] = re.findall('(?<=\s)[0-9]*(?=번째)', infos[4])
        [f1_portion, f2_portion] = re.findall('(?<=\()[0-9.]*(?=\))', infos[4])
        return NameChar(korchar, m_rank, m_portion, m1_rank, m1_portion,
                        m2_rank, m2_portion, f_rank, f_portion,
                        f1_rank, f1_portion, f2_rank, f2_portion)