def store_nene(data):
    table = pd.DataFrame(data, columns=['name', 'address', 'sido', 'gungu'])
    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))
    table.to_csv('{0}/nene_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8', mode='w', index=True)
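# All snippets in this section are excerpts from larger modules and assume the
# imports below, plus RESULT_DIRECTORY (the output directory) and a crawling()
# helper (sketched after crawling_bbq). `cw` refers to the author's own crawler
# module, which is not shown here.
import re
import sys
import time
import urllib.parse
from datetime import datetime
from itertools import count

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver

# sido_dict/gungu_dict normalize abbreviated region names. A minimal sketch,
# based on the comments in crawling_pelicana further below; the real
# dictionaries are assumed to cover every region:
sido_dict = {
    '서울': '서울특별시',
    '서울시': '서울특별시',
    '강원': '강원도',
}
gungu_dict = {}  # assumed: analogous normalization for gu/gun district names
# dict.get(v, v) returns the mapped name for known keys and v itself otherwise,
# so unknown names pass through unchanged.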
def crawling_bbq(
        err=lambda e: print('%s : %s' % (e, datetime.now()), file=sys.stderr)):
    results = []
    url = 'https://www.bbq.co.kr/page/order/store-search_left.asp' \
          '?lat=37.491872&lng=127.115922&schval=%s' % urllib.parse.quote('점')
    html = crawling(url=url)
    try:
        bs = BeautifulSoup(html, 'html.parser')
        tags_div = bs.findAll('div', attrs={'class': 'storeNearyByItem-title'})
        items = bs.findAll('div', attrs={'class': 'storeNearyByItem-address'})
        for i in range(len(tags_div)):
            name = tags_div[i].find('span').text
            address = items[i].text.strip()
            sido = address.split()[0]
            gungu = address.split()[1]
            results.append((name, address, sido, gungu))
    except AttributeError as e:
        err(e)

    table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gungu'])
    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))
    table.to_csv('{0}/bbq_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8', mode='w', index=True)
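# crawling_bbq() above calls a crawling(url) helper that is not shown in this
# section (later snippets reach the same helper as cw.crawling). A minimal
# sketch, assuming it fetches the URL with urllib.request and returns the
# decoded HTML, or None on failure -- which is how the callers detect the last
# region page:
import urllib.request
from urllib.error import HTTPError, URLError

def crawling(url):
    try:
        request = urllib.request.Request(url)
        response = urllib.request.urlopen(request)
        return response.read().decode('utf-8')
    except (HTTPError, URLError):
        return None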
def crawling_pelicana():
    results = []
    for page in count(start=1):
        url = 'http://www.pelicana.co.kr/store/stroe_search.html?gu=&si=&page=%d' % page
        html = cw.crawling(url=url)
        bs = BeautifulSoup(html, 'html.parser')
        tag_table = bs.find('table', attrs={'class': 'table mt20'})
        tag_tbody = tag_table.find('tbody')
        tags_tr = tag_tbody.findAll('tr')

        # detect the last page (no more rows)
        if len(tags_tr) == 0:
            break

        for tag_tr in tags_tr:
            strings = list(tag_tr.strings)
            name = strings[1]
            address = strings[3]
            sidogu = address.split()[:2]
            results.append((name, address) + tuple(sidogu))

    # store
    table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gungu'])
    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))
    table.to_csv('{0}/pelicana_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8', mode='w', index=True)
def crawling_kyochon():
    results = []
    for sido1 in range(1, 18):
        for sido2 in count(start=1):
            url = 'http://www.kyochon.com/shop/domestic.asp?sido1=%d&sido2=%d' % (sido1, sido2)
            html = cw.crawling(url=url)
            if html is None:
                break
            bs = BeautifulSoup(html, 'html.parser')
            tag_div = bs.find('div', attrs={'class': 'shopSchList'})
            tags_dl = tag_div.findAll('dl')
            for tag_dl in tags_dl:
                strings = list(tag_dl.strings)
                if strings[0] == '검색결과가 없습니다.':  # "No search results."
                    break
                name = strings[1]
                address = strings[3].strip()
                sidogu = address.split()[:2]
                results.append((name, address) + tuple(sidogu))

    # store
    table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gungu'])
    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))
    table.to_csv('{0}/kyochon_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8', mode='w', index=True)
def crawling_goobne():
    url = 'http://www.goobne.co.kr/store/search_store.jsp'

    # load the first page
    wd = webdriver.Chrome(r'D:\PycharmProjects\chromedriver_win32\chromedriver.exe')
    wd.get(url)
    time.sleep(5)

    results = []
    for page in count(start=1):
        # run the page's own JavaScript pager
        script = 'store.getList(%d)' % page
        wd.execute_script(script)
        print('%s : success for script execute [%s]' % (datetime.now(), script))
        time.sleep(5)

        # grab the rendered HTML and parse it with bs4
        html = wd.page_source
        bs = BeautifulSoup(html, 'html.parser')
        tag_tbody = bs.find('tbody', attrs={'id': 'store_list'})
        tags_tr = tag_tbody.findAll('tr')  # findAll returns a list

        # detect the last page
        if tags_tr[0].get('class') is None:
            break

        for tag_tr in tags_tr:
            strings = list(tag_tr.strings)
            name = strings[1]
            address = strings[6]
            sidogu = address.split()[:2]  # slice sido/gungu out of the address
            results.append((name, address) + tuple(sidogu))
        print(results)

    # store
    table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gungu'])
    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))
    table.to_csv('{0}/goobne_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8', mode='w', index=True)
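# The Selenium snippets pass the chromedriver path positionally, which works in
# Selenium 3 but was removed in Selenium 4. Under Selenium 4 the equivalent
# setup wraps the path (reusing the one from crawling_goobne above) in a
# Service object:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

service = Service(r'D:\PycharmProjects\chromedriver_win32\chromedriver.exe')
wd = webdriver.Chrome(service=service)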
def crawling_cu():
    url = 'http://cu.bgfretail.com/store/list.do?category=store'

    wd = webdriver.Chrome('D:/bigdata/chromedriver/chromedriver.exe')
    wd.get(url)

    results = []
    for page in range(1, 201):
        script = 'newsPage(%d)' % page
        wd.execute_script(script)  # run the page's own JavaScript pager
        print('%s : success for script execute [%s]' % (datetime.now(), script))
        time.sleep(1)

        html = wd.page_source
        bs = BeautifulSoup(html, 'html.parser')
        tag_div = bs.find('div', attrs={'class': 'detail_store'})
        tag_tbody = tag_div.find('tbody')
        tags_tr = tag_tbody.findAll('tr')

        # detect the last page (no more rows)
        if tags_tr == []:
            print('last page reached')
            break

        for tag_tr in tags_tr:
            strings = list(tag_tr.strings)
            name = strings[2]
            phone = strings[4]
            address = strings[10]
            sidogu = address.split()[:2]
            results.append((name, address, phone) + tuple(sidogu))
        print(results)

    # store
    table = pd.DataFrame(results,
                         columns=['name', 'address', 'phone', 'sido', 'gungu'])
    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))
    # save as CSV
    table.to_csv('{0}/cu_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8', mode='w', index=True)
def crawling_kyochon():
    results = []
    for sido1 in range(1, 18):
        for sido2 in count(start=1):
            url = 'http://www.kyochon.com/shop/domestic.asp?sido1=%d&sido2=%d&txtsearch=' % (sido1, sido2)
            html = cw.crawling(url=url)
            if html is None:
                break
            bs = BeautifulSoup(html, 'html.parser')
            tag_ul = bs.find('ul', attrs={'class': 'list'})
            tag_li = tag_ul.find('li')
            tag_a = tag_li.find('a')
            tags_dl = tag_a.findAll('dl')
            for tag_dl in tags_dl:
                strings = list(tag_dl.strings)
                try:
                    name = strings[1]
                    address = strings[3].strip()
                    sidogu = address.split()[:2]
                    results.append((name, address) + tuple(sidogu))
                except Exception:
                    continue  # skip entries without a name/address

    # store
    table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gungu'])
    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))
    print(table)
    table.to_csv('{0}/kyochon_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8', mode='w', index=True)
def crawling_goobne():
    results = []
    url = 'http://www.goobne.co.kr/store/search_store.jsp'

    # load the first page via chromedriver
    wd = webdriver.Chrome('C:/Users/minkyu/Desktop/코딩프로그램/chromedriver.exe')
    wd.get(url)
    time.sleep(5)  # wait for the page to load

    for page in count(start=1):  # count up from 1; the break below is the exit
        script = 'store.getList(%d)' % page  # the page number changes each pass
        print('%s : success for script execute [%s]' % (datetime.now(), script))
        wd.execute_script(script)  # run the page's own JavaScript pager
        time.sleep(5)

        # grab the rendered HTML
        html = wd.page_source

        # parse with bs4
        bs = BeautifulSoup(html, 'html.parser')
        tag_tbody = bs.find('tbody', attrs={'id': 'store_list'})  # <tbody id="store_list">
        tags_tr = tag_tbody.findAll('tr')  # every <tr> inside the tbody

        # detect the last page: stop when the first row has no class attribute
        if tags_tr[0].get('class') is None:
            break

        for tag_tr in tags_tr:
            strings = list(tag_tr.strings)  # all strings inside the row, as a list
            name = strings[1]     # branch name
            address = strings[6]  # address
            sidogu = address.split(' ')[0:2]  # split the address to get sido/gungu
            results.append((name, address) + tuple(sidogu))
        print(results)

    # store: build the DataFrame (table)
    table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gungu'])
    # map through the dict; values missing from it pass through unchanged
    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))
    # save as CSV in the result directory
    table.to_csv('{0}/goobne_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8', mode='w', index=True)
def crawling_goobne(
        err=lambda e: print('%s : %s' % (e, datetime.now()), file=sys.stderr)):
    results = []
    url = 'https://www.goobne.co.kr/store/search_store.jsp'

    # load the first page
    browser = webdriver.Chrome(r'D:\pythonPycharm\chromedriver')
    browser.get(url)

    # wait for page loading...
    time.sleep(3)

    for page in count(start=1):
        # run the page's own JavaScript pager
        script = 'store.getList(%d)' % page
        browser.execute_script(script)
        time.sleep(1)

        html = browser.page_source
        try:
            bs = BeautifulSoup(html, 'html.parser')
            tag_tbody = bs.find('tbody', attrs={'id': 'store_list'})
            tags_tr = tag_tbody.findAll('tr')

            # last page
            if len(tags_tr) == 1:
                break

            for tag_tr in tags_tr:
                # Parsing via tag_tr.strings breaks when a store has no phone
                # number, so select the <td> cells directly instead.
                name = tag_tr.find('td').text
                address = tag_tr.find('td', attrs={'class': 't_left'}).text.strip()[:-15]
                sido = address.split()[0]
                gungu = address.split()[1]
                results.append((name, address, sido, gungu))
        except AttributeError as e:
            err(e)

    table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gungu'])
    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))
    table.to_csv('{0}/goobne_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8', mode='w', index=True)
def crawling_goobne():
    url = 'http://www.goobne.co.kr/store/search_store.jsp'

    # load the first page
    wd = webdriver.Chrome('D:/bigdata/chromedriver/chromedriver.exe')
    wd.get(url)
    time.sleep(5)

    results = []
    for page in count(start=1):
        # the pager links run JavaScript instead of loading a URL:
        # <a href="javascript:store.getList('3');">3</a>
        script = 'store.getList(%d)' % page
        wd.execute_script(script)
        print('%s : success for script execute [%s]' % (datetime.now(), script))
        time.sleep(5)

        # grab the rendered HTML
        html = wd.page_source

        # parse with bs4
        bs = BeautifulSoup(html, 'html.parser')
        tag_tbody = bs.find('tbody', attrs={'id': 'store_list'})
        tags_tr = tag_tbody.findAll('tr')

        # detect the last page: past it the first <tr> no longer carries class="on"
        if tags_tr[0].get('class') is None:
            break

        for tag_tr in tags_tr:
            strings = list(tag_tr.strings)
            name = strings[1]
            address = strings[6]
            sidogu = address.split()[:2]
            results.append((name, address) + tuple(sidogu))
        print(results)

    # store
    table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gungu'])
    # get(v, v): return the mapped value if v is a key, otherwise v itself
    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))
    # save as CSV
    table.to_csv('{0}/goobne_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8', mode='w', index=True)
def crawling_kyochon():
    result = []
    for sido1 in range(1, 18):
        for sido2 in count(start=1):
            url = 'http://www.kyochon.com/shop/domestic.asp?sido1=%d&sido2=%d&txtsearch=' % (sido1, sido2)
            html = cw.crawling(url=url)
            if html is None:
                break
            bs = BeautifulSoup(html, 'html.parser')
            tag_ul = bs.find('ul', attrs={'class': 'list'})
            for tag_a in tag_ul.findAll('a'):
                tag_dt = tag_a.find('dt')
                if tag_dt is None:
                    break
                name = tag_dt.get_text()
                tag_dd = tag_a.find('dd')
                if tag_dd is None:
                    break
                address = tag_dd.get_text().strip().split('\r')[0]
                sidogu = address.split()[:2]
                result.append((name, address) + tuple(sidogu))

    table = pd.DataFrame(result, columns=['name', 'address', 'sido', 'gungu'])

    # remove duplicates
    table = table.\
        drop_duplicates(subset='name', keep='first').\
        reset_index(drop=True)

    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))

    table = table.drop_duplicates(subset='name', keep='first').\
        reset_index(drop=True).\
        reset_index().\
        set_index('index')

    table.to_csv('{0}/kyochon_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8', mode='w', index=True)
def crawling_pelicana():
    results = []
    for page in count(start=1):
        url = 'http://www.pelicana.co.kr/store/stroe_search.html?gu=&si=&page=%d' % page
        html = cw.crawling(url=url)
        bs = BeautifulSoup(html, 'html.parser')
        tag_table = bs.find('table', attrs={'class': 'table mt20'})
        tag_tbody = tag_table.find('tbody')
        tags_tr = tag_tbody.findAll('tr')

        # detect the last page (no more rows)
        if len(tags_tr) == 0:
            break

        for tag_tr in tags_tr:
            strings = list(tag_tr.strings)
            name = strings[1]
            address = strings[3]
            sidogu = address.split()[:2]  # slice the first two tokens
            # merging the tuples keeps each row flat instead of nesting a list/tuple
            results.append((name, address) + tuple(sidogu))
        print(results)

    # store: no separate processing step is used; everything is handled inline
    table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gungu'])
    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))
    table.to_csv('{0}/pelicana_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8', mode='w', index=True)
def store_nene(data):
    table = pd.DataFrame(data, columns=['name', 'address', 'sido', 'gungu'])
    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))

    table = table.drop_duplicates(subset='name', keep='first').\
        reset_index(drop=True).\
        reset_index().\
        set_index('index')

    table.to_csv('{0}/nene_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8', mode='w', index=True)
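# The chained calls above dedupe stores by name and rebuild a fresh 0-based
# index whose column is literally named 'index', so the saved CSV carries it
# as its first column. A toy illustration of the same chain:
import pandas as pd

df = pd.DataFrame({'name': ['a', 'a', 'b'], 'address': ['x', 'x', 'y']})
df = df.drop_duplicates(subset='name', keep='first').\
    reset_index(drop=True).\
    reset_index().\
    set_index('index')
print(df)
#       name address
# index
# 0        a       x
# 1        b       y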
def crawling_kyochon():
    results = []
    for sido1 in range(1, 18):
        for sido2 in count(start=1):
            url = 'http://www.kyochon.com/shop/domestic.asp?sido1=%d&sido2=%d' % (sido1, sido2)
            html = cw.crawling(url=url)
            if html is None:
                break
            bs = BeautifulSoup(html, 'html.parser')
            tag_ul = bs.find('ul', attrs={'class': 'list'})
            tags_li = tag_ul.findAll('li')
            for tag_li in tags_li:
                strings = list(tag_li.strings)
                if '검색결과가 없습니다.' not in strings:  # skip the "no results" row
                    name = strings[3]
                    # collapse tabs/newlines/runs of spaces inside the address
                    address = ' '.join(strings[5].split())
                    sidogu = address.split()[:2]
                    results.append((name, address) + tuple(sidogu))

    # store
    table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gungu'])
    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))
    table.to_csv('{0}/kyochon_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8', mode='w', index=True)
def crawling_kyochon():
    # reference only: the 17 provinces corresponding to sido1 codes 1-17
    sido_names = [
        '서울', '부산', '대구', '인천', '광주', '대전', '울산', '세종', '경기',
        '강원', '충북', '충남', '전북', '전남', '경북', '경남', '제주'
    ]
    results = []
    for sido1 in range(1, 18):
        for sido2 in count(start=1):
            url = 'http://www.kyochon.com/shop/domestic.asp?sido1=%d&sido2=%d' % (sido1, sido2)
            html = cw.crawling(url=url)
            if html is None:
                break
            try:
                bs = BeautifulSoup(html, 'html.parser')
                # structure: div > ul > li > <dl><dt><dd>
                tag_div = bs.find('div', attrs={'class': 'shopSchList'})
                tag_ul = tag_div.find('ul')
                tags_li = tag_ul.findAll('li')
                for tag_li in tags_li:
                    strings = list(tag_li.strings)
                    name = strings[3]
                    address = str(strings[5]).strip()
                    sidogu = address.split()[:2]
                    results.append((name, address) + tuple(sidogu))

                # store (rewritten on every page, so a crash keeps partial data)
                table = pd.DataFrame(results,
                                     columns=['name', 'address', 'sido', 'gungu'])
                table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
                table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))
                table.to_csv('{0}/kyochon_table.csv'.format(RESULT_DIRECTORY),
                             encoding='utf-8', mode='w', index=True)
            except Exception as e:
                print(e)
def crawling_pelicana():
    results = []
    for page in count(start=1):  # count up from 1; the break below is the exit
        url = 'http://www.pelicana.co.kr/store/stroe_search.html?page=%d&branch_name=&gu=&si=' % page
        html = cw.crawling(url=url)
        bs = BeautifulSoup(html, 'html.parser')
        # the store list lives in <table class="table mt20"> > <tbody> > <tr> rows
        tag_table = bs.find('table', attrs={'class': 'table mt20'})
        tag_tbody = tag_table.find('tbody')
        tags_tr = tag_tbody.findAll('tr')

        # detect the last page (no more rows)
        if len(tags_tr) == 0:
            break

        for tag_tr in tags_tr:
            strings = list(tag_tr.strings)
            name = strings[1]
            address = strings[3]
            sidogu = address.split()[:2]
            results.append((name, address) + tuple(sidogu))

    # store
    table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gungu'])
    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))
    # save as CSV
    table.to_csv('{0}/pelicana_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8', mode='w', index=True)
def crawling_kyochon():
    results = []
    for sido1 in range(1, 18):  # 17 provinces; count() here would never terminate
        for sido2 in count(start=1):
            try:
                url = 'http://www.kyochon.com/shop/domestic.asp?sido1=%d&sido2=%d&txtsearch=' % (sido1, sido2)
                html = cw.crawling(url=url)
                bs = BeautifulSoup(html, 'html.parser')
                tag_ul = bs.find('ul', attrs={'class': 'list'})
                tag_li = tag_ul.find('li')
                tags_dl = tag_li.findAll('dl')
                for tag_dl in tags_dl:
                    strings = list(tag_dl.strings)
                    name = strings[1]
                    address = strings[3].strip()
                    sidogu = address.split()[:2]
                    results.append((name, address) + tuple(sidogu))
            except Exception:
                # parsing fails past the last sido2 page; move to the next sido1
                break

    # store
    table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gungu'])
    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))
    table.to_csv('{0}/kyochon_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8', mode='w', index=True)
def crawling_pelicana():
    results = []
    for page in count(start=1):
        url = 'http://www.pelicana.co.kr/store/stroe_search.html?gu=&si=&page=%d' % page
        html = cw.crawling(url=url)
        bs = BeautifulSoup(html, 'html.parser')
        tag_table = bs.find('table', attrs={'class': 'table mt20'})
        tag_tbody = tag_table.find('tbody')
        tags_tr = tag_tbody.findAll('tr')  # a bs4.element.ResultSet

        # detect the last page (no more rows)
        if len(tags_tr) == 0:
            break

        for tag_tr in tags_tr:
            strings = list(tag_tr.strings)
            name = strings[1]
            address = strings[3]
            sidogu = address.split()[:2]
            results.append((name, address) + tuple(sidogu))  # build a flat tuple

    # store
    table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gungu'])
    # e.g. '서울' -> '서울특별시': keep the mapped value when the key exists,
    # otherwise keep the original value
    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))
    print(table)
    table.to_csv('{0}/pelicana_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8', mode='w', index=True)
def store_nene(data):
    table = pd.DataFrame(data, columns=['name', 'address', 'sido', 'gungu'])
    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))
    table.to_csv('{0}/nene_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8', mode='w', index=True)
def crawling_pelicana():
    results = []
    # page counts up from 1 forever; the break inside ends the loop
    for page in count(start=1):
        print(page, ":", end=" ")
        url = 'http://pelicana.co.kr/store/stroe_search.html?page=' + str(page) + '&branch_name=&gu=&si='
        html = crawling(url=url)
        bs = BeautifulSoup(html, 'html.parser')
        tag_table = bs.find('table', attrs={"class": "table mt20"})
        tag_tbody = tag_table.find('tbody')
        tags_tr = tag_tbody.findAll('tr')

        # last page
        if len(tags_tr) == 0:
            break

        # convert each row to a tuple
        for tag_tr in tags_tr:
            strs = list(tag_tr.strings)
            name = strs[1]
            address = strs[3]
            sidogu = address.split(" ")[:2]
            # concatenating the tuples yields rows like:
            # [('황간점', '충청북도 영동군 황간면 남성리 558-1', '충청북도', '영동군'), ...]
            results.append((name, address) + tuple(sidogu))

    # store (mind the column order)
    table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gungu'])
    # sido_dict looks like {'서울시': '서울특별시', '서울': '서울특별시', '강원': '강원도', ...}.
    # sido_dict.get('서울시') returns the value '서울특별시' for the key '서울시',
    # and get(v, v) falls back to v itself when the key is missing.
    # The anonymous function is passed to Series.apply() to map every cell.
    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))
    table.to_csv('{0}/pelicana_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8', mode='w', index=True)
def crawling_pelicana():
    results = []
    for page in count(start=1):
        url = 'http://www.pelicana.co.kr/store/stroe_search.html?page=%d&branch_name=&gu=&si=' % page
        print(page)
        html = cw.crawling(url=url)
        bs = BeautifulSoup(html, 'html.parser')
        # find the table whose class is "table mt20"
        tag_table = bs.find('table', attrs={'class': 'table mt20'})
        tag_tbody = tag_table.find('tbody')
        tags_tr = tag_tbody.findAll('tr')

        # detect the last page (no more rows)
        if len(tags_tr) == 0:
            break

        for tag_tr in tags_tr:
            # all strings inside the row (including newlines/tabs) as a list
            strings = list(tag_tr.strings)
            name = strings[1]     # branch name
            address = strings[3]  # address
            sidogu = address.split()[:2]  # slice the first two tokens: sido, gungu
            # a tuple keeps the row data immutable
            results.append((name, address) + tuple(sidogu))

    # store
    table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gungu'])
    # normalize sido: map through the dict, leaving unknown values unchanged
    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))
    table.to_csv('{0}/pelicana_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8', mode='w', index=True)
def crawling_kyochon():
    results = []
    for sido1 in range(1, 18):
        for sido2 in count(start=1):
            url = 'http://www.kyochon.com/shop/domestic.asp?sido1=%d&sido2=%d&txtsearch=' % (sido1, sido2)
            html = cw.crawling(url=url)
            if html is None:
                break
            bs = BeautifulSoup(html, 'html.parser')
            # descend step by step; grabbing everything at once mis-parses
            tag_div = bs.find('div', attrs={'class': 'shopSchList'})
            tags_li = tag_div.findAll('li')
            for tag_li in tags_li:
                strings = list(tag_li.strings)
                try:
                    name = strings[3]             # branch name
                    address = strings[5].strip()  # address
                    sidogu = address.split()[:2]  # first two tokens: sido, gungu
                    results.append((name, address) + tuple(sidogu))
                except Exception as e:
                    print('%s : %s' % (e, datetime.now()), file=sys.stderr)

    table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gungu'])
    # map through the dict, leaving unknown values unchanged
    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))
    # save as CSV in the result directory
    table.to_csv('{0}/kyochon_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8', mode='w', index=True)
def crawling_kyochon():
    results = []
    for sido1 in range(1, 18):
        for sido2 in count(start=1):
            try:
                url = 'http://www.kyochon.com/shop/domestic.asp?sido1=%d&sido2=%d&txtsearch=' % (sido1, sido2)
                html = cw.crawling(url=url)
                bs = BeautifulSoup(html, 'html.parser')
                tag_div = bs.find('div', attrs={'class': 'shopSchList'})
                tags_li = tag_div.findAll('li')
                for tag_li in tags_li:
                    strings = list(tag_li.strings)
                    name = strings[3]
                    address = strings[5].strip()
                    sidogu = address.split()[:2]
                    results.append((name, address) + tuple(sidogu))
            except Exception:
                # parsing fails past the last sido2 page; move to the next sido1
                break

    # store
    table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gungu'])
    # apply() feeds each cell to the lambda as v and keeps the mapped value
    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))
    table.to_csv('{0}/kyochon_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8', mode='w', index=True)
def crawling_kyochon(
        err=lambda e: print('%s : %s' % (e, datetime.now()), file=sys.stderr)):
    results = []
    for sido1 in range(1, 18):  # sido1 codes run 1-17
        for sido2 in count(start=1):
            print(sido1, ", ", sido2, " :", end=" ")
            url = 'http://www.kyochon.com/shop/domestic.asp?sido1=' + str(sido1) + '&sido2=' + str(sido2) + '&txtsearch='
            html = crawling(url=url)
            try:
                bs = BeautifulSoup(html, 'html.parser')
                tag_div = bs.find('div', attrs={'class': 'shopSchList'})
                tags_dl = tag_div.findAll('dl')

                # last page
                if len(tags_dl) == 0:
                    break

                for tag_dl in tags_dl:
                    name = tag_dl.find('dt').text
                    address = tag_dl.find('dd').text.strip().replace('\t', '').split('\r\n')[0]
                    sido = address.split()[0]
                    gungu = address.split()[1]
                    results.append((name, address, sido, gungu))
            except AttributeError as e:
                err(e)

    table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gungu'])
    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))
    table.to_csv('{0}/kyochon_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8', mode='w', index=True)
def crawling_kyochon():
    results = []
    for sido1 in range(1, 18):
        for sido2 in count(start=1):
            url = 'http://www.kyochon.com/shop/domestic.asp?sido1=%d&sido2=%d&txtsearch=' % (sido1, sido2)
            html = cw.crawling(url=url)
            if html is None:
                break
            bs = BeautifulSoup(html, 'html.parser')
            tag_div = bs.find('div', attrs={'class': 'shopSchList'})
            tag_ul = tag_div.find('ul')
            tags_li = tag_ul.findAll('li')
            for tag_li in tags_li:
                strings = list(tag_li.strings)
                if strings[0] == '검색결과가 없습니다.':  # "No search results."
                    break
                name = strings[3]
                address = strings[6].strip().replace('(', '').replace(')', '').replace(' ', '')
                sidogu = strings[5].split()[:2]
                results.append((name, address) + tuple(sidogu))

    table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gungu'])
    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))
    table.to_csv('{0}/kyochon_table2.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8', mode='w', index=True)
def crawling_kyochon():
    results = []
    for sido1 in range(1, 18):
        for sido2 in count(start=1):
            url = 'http://www.kyochon.com/shop/domestic.asp?sido1=%d&sido2=%d&txtsearch=' % (sido1, sido2)
            html = cw.crawling(url=url)
            if html is None:
                break
            bs = BeautifulSoup(html, 'html.parser')
            tag_div = bs.find('div', attrs={'class': 'shopSchList'})
            tag_ul = tag_div.find('ul', attrs={'class': 'list'})
            tags_dl = tag_ul.findAll('dl')
            for dl in tags_dl:
                try:
                    strings = list(dl.strings)
                    name = strings[1] + '점'  # append the "branch" suffix
                    # strip carriage returns/newlines/tabs, then edge whitespace
                    address = re.sub('[\r\n\t]', '', strings[3]).strip()
                    sidogu = address.split()[:2]
                    results.append((name, address) + tuple(sidogu))
                except Exception as e:
                    print(e, file=sys.stderr)
                    continue

    table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gungu'])
    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))
    # save as CSV in the result directory
    table.to_csv('{0}/kyochon_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8', mode='w', index=True)