def ex02():
    crawler.crawling(
        url='https://movie.naver.com/movie/sdb/rank/rmovie.nhn',
        encoding='cp949',
        proc1=proc_naver_movie_rank,
        proc2=lambda data: list(
            map(lambda div: print(div.a.text, div.a['href'], sep=':'), data)))
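The ex02 variants all call a shared crawler.crawling helper that is not shown in this collection. A minimal sketch of what such a fetch-then-process pipeline might look like, assuming the url/encoding/err/proc1/proc2 keyword names used above; the real module may differ:

import urllib.request
from urllib.error import URLError

def crawling(url='', encoding='utf-8', err=None, proc1=None, proc2=None):
    # fetch and decode the page
    try:
        with urllib.request.urlopen(url) as response:
            html = response.read().decode(encoding, errors='replace')
    except URLError as e:
        if err is not None:
            err(e)  # hand the failure to the caller's callback
        return None
    # proc1 parses the raw HTML, proc2 consumes whatever proc1 produced
    data = proc1(html) if proc1 is not None else html
    return proc2(data) if proc2 is not None else data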
def crawling_kyochon():
    results = []
    for sido1 in count(1):
        url = 'http://www.kyochon.com/shop/domestic.asp?sido1=%d&sido2=&txtsearch=' % (
            sido1)
        html = crawler.crawling(url)
        if html is False:
            break
        for sido2 in count(1):
            url = 'http://www.kyochon.com/shop/domestic.asp?sido1=%d&sido2=%d&txtsearch=' % (
                sido1, sido2)
            html = crawler.crawling(url)
            if html is False:
                break
            print(sido1, sido2, sep=' / ')
            bs = BeautifulSoup(html, 'html.parser')
            tag_ul = bs.find('ul', attrs={'class': 'list'})
            tag_spans = tag_ul.findAll('span', attrs={'class': 'store_item'})
            for tag_span in tag_spans:
                strings = list(tag_span.strings)
                name = strings[1]
                address = strings[3].replace('\r\n\t', '').strip()
                sidogu = address.split()[:2]
                results.append((name, address) + tuple(sidogu))
def ex02():
    crawler.crawling(
        url='http://movie.naver.com/movie/sdb/rank/rmovie.nhn',
        encoding='cp949',
        proc1=proc_naver_movie_rank,
        proc2=lambda data: list(
            map(lambda t: print(t[0], t[1].a.text, t[1].a['href'], sep=':'),
                enumerate(data))))
def ex02():
    crawler.crawling(
        # fetch step
        url='http://movie.naver.com/movie/sdb/rank/rmovie.nhn',
        encoding='cp949',
        # processing steps
        proc1=proc_naver_movie_rank,
        proc2=lambda data: list(map(lambda x: print(x.a), data)))
def ex02():
    # fetch
    crawler.crawling(
        url='https://movie.naver.com/movie/sdb/rank/rmovie.nhn',
        encoding='cp949',
        proc1=proc_naver_movie_rank,
        proc2=lambda data: list(
            map(
                lambda t: print(t[0] + 1, t[1].a.text, t[1].a['href'],
                                sep=':'),
                enumerate(data)))
        # err=error() swallows the error: it calls the handler immediately
        # instead of passing the function itself (err=error)
    )
def ex02():
    crawler.crawling(
        url='https://movie.naver.com/movie/sdb/rank/rmovie.nhn',
        encoding='cp949',
        proc1=proc_naver_movie_rank,
        proc2=lambda data: list(
            map(
                lambda div: print(div[0], div[1].a.text, div[1].a['href'],
                                  sep=":::"),
                enumerate(data)))
        # err=error
    )
def ex02():
    crawler.crawling(
        url='https://movie.naver.com/movie/sdb/rank/rmovie.nhn?sel=cnt&date=20190617',
        encoding='cp949',
        err=error,
        proc1=proc_naver_movie_rank,
        proc2=lambda data: list(
            map(
                lambda div: print(div.a.text,
                                  'https://movie.naver.com/' + div.a['href'],
                                  sep=" : "),
                data)))
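The variants above also assume a proc_naver_movie_rank parser and an error callback defined elsewhere. A plausible sketch; the div.tit3 selector is a guess inferred from the div.a usage above, not confirmed by the source:

from bs4 import BeautifulSoup

def proc_naver_movie_rank(html):
    # return the per-movie <div> tags; proc2 then reads div.a.text and div.a['href']
    bs = BeautifulSoup(html, 'html.parser')
    return bs.findAll('div', attrs={'class': 'tit3'})

def error(e):
    # passed as err=error: report the failure instead of dropping it silently
    print('crawling error:', e)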
def crawling_pelicana():
    results = []
    for page in count(start=113):
        url = 'https://pelicana.co.kr/store/stroe_search.html?branch_name=&gu=&si=&page=%d' % page
        html = crawler.crawling(url)
        # html = get_html(url)
        bs = BeautifulSoup(html, 'html.parser')

        # ---- approach 1
        # trs = bs.table.findAll("tr")
        # for i in range(1, len(trs) - 1):
        #     td = trs[i].findAll('td')
        #     title = td[0].text
        #     addr = td[1].text
        #     call = td[2].text.strip()
        #     sidogu = addr.split()[:2]

        # ---- approach 2
        tag_table = bs.find('table', attrs={'class': 'table mt20'})
        tag_tbody = tag_table.find('tbody')
        tags_tr = tag_tbody.findAll('tr')

        # detect the last page
        if len(tags_tr) == 0:
            break

        for tag_tr in tags_tr:
            strings = list(tag_tr.strings)
            name = strings[1]
            address = strings[3]
            sidogu = address.split()[:2]
            results.append((name, address) + tuple(sidogu))
def crawling_kyochon():
    # This function holds only the control flow that loops while data exists.
    # Preprocessing and storage are delegated to separate functions:
    # preprocessing: preprocessing_kyochon / storage: store_kyochon
    total = []
    for sido1 in range(1, 18):
        results = []
        for sido2 in count(start=1):
            url = 'http://www.kyochon.com/shop/domestic.asp?sido1={0}&sido2={1}&txtsearch='.format(
                sido1, sido2)
            result = cw.crawling(url, proc=preprocessing_kyochon)
            if result is None:
                break
            results.extend(result)
        total.extend(results)
    print(total)
    store_kyochon(total)
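Neither preprocessing_kyochon nor store_kyochon is shown. A sketch consistent with the other Kyochon variants in this collection; the ul.list / span.store_item selectors mirror the parsing used elsewhere here, and the output path is an assumption:

import pandas as pd
from bs4 import BeautifulSoup

def preprocessing_kyochon(html):
    # return (name, address, sido, gungu) tuples, or None when the page has no
    # store list (the control loop above treats None as the end signal)
    bs = BeautifulSoup(html, 'html.parser')
    tag_ul = bs.find('ul', attrs={'class': 'list'})
    if tag_ul is None:
        return None
    rows = []
    for tag_span in tag_ul.findAll('span', attrs={'class': 'store_item'}):
        strings = list(tag_span.strings)
        name = strings[1]
        address = strings[3].strip()
        rows.append((name, address) + tuple(address.split()[:2]))
    return rows or None

def store_kyochon(rows):
    table = pd.DataFrame(rows, columns=['name', 'address', 'sido', 'gungu'])
    table.to_csv('__results__/kyochon.csv', encoding='utf-8', index=True)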
def crawling_mnet_month_chart():
    results = []
    RESULT_DIRECTORY = '__result__'
    last_page = dt.now().strftime('%Y%m')  # count() never ends, so stop at the current month
    # for page in count(start=200901):
    for year in count(start=2009):
        for month in range(1, 13):  # range(1, 12) would skip December
            page = '%d%02d' % (year, month)  # zero-pad the month: '200901', not '20091'
            if page > last_page:
                break
            url = 'http://www.mnet.com/chart/TOP100/%s' % page
            html = cw.crawling(url=url)
            bs = BeautifulSoup(html, 'html.parser')
            tag_tbody = bs.find('tbody')
            tags_tr = tag_tbody.findAll('tr')
            print(page, len(tags_tr), sep=':')

            # detect an empty chart page
            if len(tags_tr) == 0:
                break

            # each chart row carries a song title in td.MMLItemTitle > a.MMLI_Song
            for tag_tr in tags_tr:
                tag_td = tag_tr.find('td', attrs={'class': 'MMLItemTitle'})
                title = tag_td.find('a', attrs={'class': 'MMLI_Song'}).get_text()
                results.append(title)
        else:
            continue
        break  # propagate the inner break and finish

    # store
    table = pd.DataFrame(results, columns=['title'])
    table.to_csv('{0}/mnet_month_100.csv'.format(RESULT_DIRECTORY))
def crawling_nene():
    result = []
    # use range when the start and end are known, count when only the start is
    for page in count(start=1):
        html = crawling(
            'https://nenechicken.com/17_new/sub_shop01.asp?page={}&ex_select=1&ex_select2=&IndexSword=&GUBUN=A'
            .format(page))
        bs = BeautifulSoup(html, 'html.parser')
        tags_div = bs.findAll('div', attrs={'class': 'shopInfo'})
        for tag_div in tags_div:
            shopname = tag_div.find('div', attrs={'class': 'shopName'}).text
            shopaddress = tag_div.find('div', attrs={'class': 'shopAdd'}).text
            result.append((shopname, shopaddress))
        # a short page (fewer than 24 shops) marks the last page
        if len(tags_div) < 24:
            break

    # store
    table = pd.DataFrame(result, columns=['name', 'address'])
    table.to_csv('{0}/nene_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8', mode='w', index=True)
    print(table)
def crawling_kyochon():
    results = []
    for sido1 in range(1, 2):  # TODO: change range to count for full coverage
        for sido2 in range(24, 27):
            url = 'http://www.kyochon.com/shop/domestic.asp?sido1={0}&sido2={1}&txtsearch='.format(
                sido1, sido2)
            html = crawler.crawling(url, encoding='utf-8')

            # detect the end
            if html is None:
                break

            # parse (attrs must be a dict, not a list)
            bs = BeautifulSoup(html, 'html.parser')
            tag_ul = bs.find('ul', attrs={'class': 'list'})
            tag_span_store_items = tag_ul.findAll(
                'span', attrs={'class': 'store_item'})

            for t in tag_span_store_items:
                strings = list(t.strings)
                # print(strings)
                name = strings[1]
                address = strings[3].replace('\r', '').replace('\t', '').replace('\n', '')
                # address = strings[3].strip('\r\n\t')
                sidogu = address.split()[:2]
                results.append((name, address) + tuple(sidogu))

    print(results)
def crawling_pelicana():
    results = []
    for page in count(start=1):
        url = 'https://pelicana.co.kr/store/stroe_search.html?page=%d&branch_name=&gu=&si=' % page
        try:
            html = crawler.crawling(url)
        except Exception:
            continue  # note: retrying forever can loop if the site stays down
        bs = BeautifulSoup(html, 'html.parser')
        tag_table = bs.find('table', attrs={'class': 'table mt20'})
        tag_tbody = tag_table.find('tbody')
        tags_tr = tag_tbody.findAll('tr')

        # detect the last page
        if len(tags_tr) == 0:
            break

        for tag_tr in tags_tr:
            strings = list(tag_tr.strings)
            name = strings[1]
            address = strings[3]
            sidogu = address.split()[:2]
            results.append((name, address) + tuple(sidogu))

    # store
    table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gugun'])
    table.to_csv('__results__/pelicana.csv', encoding='utf-8', mode='w',
                 index=True)
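The bare except/continue above retries the same page indefinitely if the site stays down. A sketch of a bounded retry, assuming the same crawler.crawling helper; fetch_with_retries is a hypothetical name, not part of the original code:

def fetch_with_retries(url, retries=3):
    # give up after a few failures instead of looping forever
    for attempt in range(retries):
        try:
            return crawler.crawling(url)
        except Exception as e:
            print('retry %d/%d for %s: %s' % (attempt + 1, retries, url, e))
    return None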
def crawling_pelicana():
    results = []
    for page in count(start=1):
        url = 'http://www.pelicana.co.kr/store/stroe_search.html?gu=&si=&page=%d' % page
        html = cw.crawling(url=url)
        bs = BeautifulSoup(html, 'html.parser')
        tag_table = bs.find('table', attrs={'class': 'table mt20'})
        tag_tbody = tag_table.find('tbody')
        tags_tr = tag_tbody.findAll('tr')

        # detect the last page
        if len(tags_tr) == 0:
            break

        for tag_tr in tags_tr:
            strings = list(tag_tr.strings)
            name = strings[1]
            address = strings[3]
            sidogu = address.split()[:2]
            results.append((name, address) + tuple(sidogu))

    # store
    table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gungu'])
    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))
    table.to_csv('{0}/pelicana_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8', mode='w', index=True)
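sido_dict, gungu_dict, and RESULT_DIRECTORY are module-level definitions not shown here. A plausible sketch; the mapping entries are illustrative assumptions for normalizing abbreviated region names, not the full table:

import os

RESULT_DIRECTORY = '__results__'
os.makedirs(RESULT_DIRECTORY, exist_ok=True)  # ensure the output directory exists

# map abbreviated sido/gungu names to their official forms
sido_dict = {'서울': '서울특별시', '부산': '부산광역시', '경기': '경기도'}
gungu_dict = {'강남': '강남구', '해운대': '해운대구'}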
def crawling_nene():
    results = []
    for page in count(start=1, step=1):
        url = 'https://nenechicken.com/17_new/sub_shop01.asp?page=%d' % page
        html = crawler.crawling(url)
        bs = BeautifulSoup(html, 'html.parser')
        divs = bs.findAll('div', attrs={'class': 'shopInfo'})
        for div in divs:
            div_shop = div.find('div', attrs={'class': 'shopName'})
            div_add = div.find('div', attrs={'class': 'shopAdd'})
            name = list(div_shop.strings)[0]
            # print(name)
            address = list(div_add.strings)[0]
            sidogu = address.split()[0:2]
            # print(sidogu)
            t = (name, address) + tuple(sidogu)
            # print(t)
            results.append(t)
        # fragile end check: hard-codes the last page number and shop name
        if (page >= 47) and (name == '서울구로구고척스카이돔점'):
            break

    # store
    table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gugun'])
    table.to_csv('results/nene.csv', encoding='utf-8', mode='w', index=True)
def crawling_pelicana():
    results = []
    for index in count(start=1, step=1):
        url = f"https://pelicana.co.kr/store/stroe_search.html?page={index}&branch_name=&gu=&si="
        html = crawler.crawling(url)
        bs = BeautifulSoup(html, "html.parser")
        tag_table = bs.find("table", attrs={"class": ["table", "mt20"]})
        tag_tbody = tag_table.find("tbody")
        tags_tr = tag_tbody.find_all("tr")

        # detect the end
        if len(tags_tr) == 0:
            print("No results : finish at index no. " + str(index))
            break

        for tag_tr in tags_tr:
            datas = list(tag_tr.strings)
            name = datas[1]
            address = datas[3]
            citi = address.split()[:2]
            t = (name, address) + tuple(citi)
            results.append(t)
            # print(name, address, citi)

        # print(len(tags_tr))

    # store (thru pandas)
    table = pd.DataFrame(results, columns=["name", "address", "sido", "gugun"])
    table.to_csv("results/pelicana.csv", encoding="utf-8", mode="w", index=True)
def crawling_kyochon():
    results = []
    for sido1 in range(1, 18):
        for sido2 in count(start=1):
            url = (
                "http://www.kyochon.com/shop/domestic.asp?sido1={}&sido2={}&txtsearch="
            ).format(sido1, sido2)
            html = crawling(url)
            if html is None:
                break
            bs = BeautifulSoup(html, "html.parser")
            tag_ul = bs.find("ul", attrs={"class": "list"})
            for tag_a in tag_ul.findAll("a"):
                tag_dt = tag_a.find("dt")
                if tag_dt is None:
                    break
                name = tag_dt.get_text()
                address = tag_a.find("dd").get_text().strip().split("\r\n")[0]
                results.append((name, address))

    table = pd.DataFrame(results, columns=["name", "address"])
    table.to_csv("{0}/kyochon_table.csv".format(RESULT_DIRECTORY),
                 encoding="utf-8", mode="w", index=True)
def crawling_nene():
    results = []
    # for page in range(1, 5):
    for page in count(start=1):
        html = crawling(
            "https://nenechicken.com/17_new/sub_shop01.asp?page=%d&ex_select=1&ex_select2=&IndexSword=&GUBUN=A"
            % page)
        bs = BeautifulSoup(html, "html.parser")
        tags_div = bs.findAll("div", attrs={"class": "shopInfo"})
        for tag_div in tags_div:
            name = tag_div.find("div", attrs={"class": "shopName"}).text
            address = tag_div.find("div", attrs={"class": "shopAdd"}).text
            results.append((name, address))
        # a short page (fewer than 24 shops) marks the last page
        if len(tags_div) < 24:
            break

    table = pd.DataFrame(results, columns=["name", "address"])
    table.to_csv("{0}/nene_table.csv".format(RESULT_DIRECTORY),
                 encoding="utf-8", mode="w", index=True)
def crawling_pelicana():
    results = []
    # for page in range(1, 2):
    for page in count(start=1):
        html = crawling(
            "http://pelicana.co.kr/store/stroe_search.html?page={}&branch_name=&gu=&si="
            .format(page))
        bs = BeautifulSoup(html, "html.parser")
        tag_table = bs.find("table", attrs={"class": "table mt20"})
        tag_tbody = tag_table.find("tbody")
        tags_tr = tag_tbody.findAll("tr")
        if len(tags_tr) == 0:
            break
        for tag_tr in tags_tr:
            strings = list(tag_tr.strings)
            name = strings[1]
            address = strings[3]
            results.append((name, address))

    table = pd.DataFrame(results, columns=["name", "address"])
    table.to_csv("{0}/pelicana_table.csv".format(RESULT_DIRECTORY),
                 encoding="utf-8", mode="w", index=True)
def crawling_kyochon():
    results = []
    for sido1 in range(1, 2):
        for sido2 in range(1):
            url = 'http://www.kyochon.com/shop/domestic.asp?sido1=%d&sido2=%d' % (sido1, sido2)
            html = crawler.crawling(url)

            # detect the end
            if html is None:
                break

            bs = BeautifulSoup(html, 'html.parser')
            tag_ul = bs.find('ul', attrs={'class': 'list'})
            tags_span = tag_ul.findAll('span', attrs={'class': 'store_item'})
            for tag_span in tags_span:
                strings = list(tag_span.strings)
                # print(strings)
                name = strings[1]
                address = strings[3].strip('\r\n\t')
                sidogu = address.split()[:2]
                results.append((name, address) + tuple(sidogu))

    for t in results:
        print(t)
def crawling_mnet_week_chart():
    results = []
    RESULT_DIRECTORY = '__result__'
    year = 2017
    for month in range(1, 13):
        page = '%d%02d' % (year, month)  # zero-pad the month: '201701', not '20171'
        url = 'http://www.mnet.com/chart/TOP100/{0}'.format(page)
        html = cw.crawling(url=url)
        bs = BeautifulSoup(html, 'html.parser')
        tag_tbody = bs.find('tbody')
        tags_tr = tag_tbody.findAll('tr')
        for tag_tr in tags_tr:
            tag_td = tag_tr.find('td', attrs={'class': 'MMLItemTitle'})
            title_of_song = tag_td.find('a', attrs={'class': 'MMLI_Song'})
            results.append(title_of_song.get_text())

    title = list(set(results))  # deduplicate titles that chart in several months

    # store
    table = pd.DataFrame(title, columns=['title'])
    table.to_csv('{0}/mnet_weeks_100.csv'.format(RESULT_DIRECTORY))
def crawling_kyochon():
    results = []
    for sido1 in range(1, 18):
        for sido2 in count(start=1):
            url = 'http://www.kyochon.com/shop/domestic.asp?sido1=%d&sido2=%d&txtsearch=' % (
                sido1, sido2)
            html = crawler.crawling(url)

            # detect the end
            if html is None:
                break

            bs = BeautifulSoup(html, 'html.parser')
            tag_ul = bs.find('ul', attrs={'class': 'list'})
            tags_spans = tag_ul.findAll('span', attrs={'class': 'store_item'})
            for tag_span in tags_spans:
                strings = list(tag_span.strings)
                # print(strings)
                name = strings[1]
                address = strings[3].strip('\r\n\t')
                sidogu = address.split()[:2]
                results.append((name, address) + tuple(sidogu))

    table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gugun'])
    # table.to_csv('__results__/nene.csv', encoding='UTF-8', mode='w', index=True)
    print(table)
def crawling_kyochon():
    results = []
    for sido in range(1, 18):
        for sido2 in count(start=1):
            url = 'http://www.kyochon.com/shop/domestic.asp?sido1=%d&sido2=%d' % (sido, sido2)
            html = cw.crawling(url=url)
            if html is None:
                break
            bs = BeautifulSoup(html, 'html.parser')
            tag_li = bs.find('div', attrs={'class': 'shopSchList'})
            tags_dl = tag_li.findAll('dl')
            for tag_dl in tags_dl:
                strings = list(tag_dl.strings)
                if strings[0] == '검색결과가 없습니다.':  # 'no search results'
                    break
                name = strings[1]
                address = strings[3].strip()
                sidogu = address.split()[:2]
                results.append((name, address) + tuple(sidogu))

    # store
    table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gungu'])
    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))
    table.to_csv('{0}/kyochon_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8', mode='w', index=True)
def crawling_kyochon():
    results = []
    for sido in range(1, 18):
        for sido2 in range(1, 27):
            url = 'http://www.kyochon.com/shop/domestic.asp?sido1=%d&sido2=%d&txtsearch=' % (
                sido, sido2)
            html = crawler.crawling(url)
            if html is None:
                break
            bs = BeautifulSoup(html, 'html.parser')
            tag_spans = bs.findAll(attrs={'class': 'store_item'})
            for tag_span in tag_spans:
                strings = list(tag_span.strings)
                name = strings[1]
                # strip surrounding whitespace and line breaks
                addr = strings[3].strip('\r\n\t')
                sidogu = addr.split()[:2]
                results.append((name, addr) + tuple(sidogu))

    for t in results:
        print(t)
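The variants above clean the scraped address with str.strip('\r\n\t'), chained str.replace calls, or replace('\r\n\t', ''); these are not equivalent. strip removes the listed characters only from both ends, chained replace removes every occurrence (and can fuse words), and replacing the literal three-character sequence misses any other ordering. A quick illustration with a made-up address:

raw = '\r\n\t서울특별시 강남구\r\n\t테헤란로 1'
print(repr(raw.strip('\r\n\t')))
# '서울특별시 강남구\r\n\t테헤란로 1'  (the inner run survives)
print(repr(raw.replace('\r', '').replace('\n', '').replace('\t', '')))
# '서울특별시 강남구테헤란로 1'  (every occurrence removed; words fuse)
print(repr(' '.join(raw.split())))
# '서울특별시 강남구 테헤란로 1'  (collapse all whitespace to single spaces)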
def ex01():
    results = []
    # count(start) runs forever, so a break is mandatory
    for page in count(start=113):
        url = 'https://pelicana.co.kr/store/stroe_search.html?branch_name=&gu=&si=&page=%d' % page
        html = crawler.crawling(url)
        bs = BeautifulSoup(html, 'html.parser')
        tag_table = bs.find('table', attrs={'class': 'table mt20'})
        tag_tbody = tag_table.find('tbody')
        tags_tr = tag_tbody.findAll('tr')

        # detect the last page
        if len(tags_tr) == 0:
            break

        for tag_tr in tags_tr:
            # strings drops the tags and keeps only the text nodes
            strings = list(tag_tr.strings)
            print(tag_tr)
            print(strings)
            name = strings[1]
            address = strings[3]
            sidogu = address.split()[:2]
            t = (name, address) + tuple(sidogu)
            results.append(t)
def crawling_nene():
    results = []
    endstr = ''
    for page in range(1, 5):
        url = 'https://nenechicken.com/17_new/sub_shop01.asp?page=%d&ex_select=1&ex_select2=&IndexSword=&GUBUN=A' % page
        html = crawler.crawling(url=url)

        # extract the data to parse
        bs = BeautifulSoup(html, 'html.parser')
        divs = bs.findAll('div', attrs={'class': 'shopInfo'})

        # first shop name on the page, used for the end check
        first_shopname = bs.find('div', attrs={'class': 'shopName'}).text

        # end check: stop when a page repeats the previous first shop name
        if endstr == first_shopname:
            break

        for i, div in enumerate(divs):
            strings = list(div.strings)
            if len(strings) == 10:
                name, address = strings[6], strings[8]
            else:
                name, address = strings[4], strings[6]
            if i == 0:
                endstr = name
            sidogu = address.split()[:2]
            results.append((name, address) + tuple(sidogu))
def crawling_nene():
    results = []
    firstname = ''
    currentname = ' '
    for page in range(1, 5):
        url = 'https://nenechicken.com/17_new/sub_shop01.asp?page=%d&ex_select=1&ex_select2=&IndexSword=&GUBUN=A' % page
        html = crawler.crawling(url)
        bs = BeautifulSoup(html, 'html.parser')
        divs = bs.find('div', attrs={'class': 'shopWrap'})
        shoplist = divs.findAll('div', attrs={'class': 'shop'})

        # stop when the first shop name repeats the previous page's
        currentname = shoplist[0].find('div', attrs={'class': 'shopName'}).text
        if firstname == currentname:
            break
        firstname = currentname

        for shop in shoplist:
            strings = list(shop.strings)
            # 'Pizza' rows carry two extra strings before the name and address
            if strings[8] == 'Pizza':
                name = strings[12]
                address = strings[14]
            else:
                name = strings[10]
                address = strings[12]
            sidogu = address.split()[:2]
            results.append((name, address) + tuple(sidogu))
def crawling_pelicana():
    results = []
    for index in count(start=110, step=1):
        url = f'https://pelicana.co.kr/store/stroe_search.html?page={index}&branch_name=&gu=&si='
        html = crawler.crawling(url)
        bs = BeautifulSoup(html, 'html.parser')
        tag_table = bs.find('table', attrs={'class': ['table', 'mt20']})
        tag_tbody = tag_table.find('tbody')
        tags_tr = tag_tbody.findAll('tr')

        # detect the last page
        if len(tags_tr) == 0:
            break

        for tag_tr in tags_tr:
            datas = list(tag_tr.strings)
            name = datas[1]
            address = datas[3]
            sidogu = address.split(' ')[:2]
            t = (name, address) + tuple(sidogu)
            results.append(t)
            # print(name, address, sidogu)

    print(results)
def crawling_pelicana():
    results = []
    for index in count(start=110, step=1):
        url = f'https://pelicana.co.kr/store/stroe_search.html?page={index}&branch_name=&gu=&si='
        html = crawler.crawling(url)
        bs = BeautifulSoup(html, 'html.parser')
        tag_table = bs.find('table', attrs={'class': ['table', 'mt20']})
        tag_tbody = tag_table.find('tbody')
        tags_tr = tag_tbody.findAll('tr')

        # detect the last page
        if len(tags_tr) == 0:
            break

        for tag_tr in tags_tr:
            # strip the tags, keeping the text nodes as a list
            datas = list(tag_tr.strings)
            name = datas[1]
            address = datas[3]
            # slice the split address down to the needed sido and gu/gun parts
            sidogugun = address.split()[:2]
            t = (name, address) + tuple(sidogugun)
            results.append(t)

    # store
    table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gugun'])
    table.to_csv('results/pelicana.csv', encoding='utf-8', mode='w', index=True)
def crawling_pelicana():
    result = []
    # use range when the start and end are known, count when only the start is
    for page in count(start=1):
        html = crawling(
            'http://pelicana.co.kr/store/stroe_search.html?page={}&branch_name=&gu=&si='
            .format(page))
        bs = BeautifulSoup(html, 'html.parser')
        tag_table = bs.find('table', attrs={'class': 'table mt20'})
        tag_tbody = tag_table.find('tbody')
        tags_tr = tag_tbody.findAll('tr')

        # detect the last page
        if len(tags_tr) == 0:
            break

        for tag_tr in tags_tr:
            strings = list(tag_tr.strings)
            name = strings[1]
            address = strings[3]
            result.append((name, address))
            # print("{} : {}".format(name, address))

    # store
    table = pd.DataFrame(result, columns=['name', 'address'])
    table.to_csv('{0}/pelicana_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8', mode='w', index=True)
    print(table)
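Several variants call a bare crawling(url) helper rather than crawler.crawling. A minimal sketch under the assumption that it simply fetches and decodes the page; the real helper may add headers, retries, or the proc/err hooks shown earlier:

import urllib.request

def crawling(url):
    # fetch the page and decode it, assuming the site serves UTF-8
    with urllib.request.urlopen(url) as response:
        return response.read().decode('utf-8', errors='replace')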