Example #1
def store_nene(data):
    table = pd.DataFrame(data, columns=['name', 'address', 'sido', 'gungu'])

    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))

    table.to_csv('{0}/nene_table.csv'.format(RESULT_DIRECTORY), encoding='utf-8', mode='w', index=True)
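Every snippet on this page relies on the same module-level setup: pandas as pd, BeautifulSoup, itertools.count, a RESULT_DIRECTORY output path, and the sido_dict/gungu_dict maps that normalize short region names. That context is not shown in the excerpts; a minimal sketch of it, with illustrative paths and dictionary entries, looks like this:

import sys
import time
import urllib.parse
import urllib.request
from datetime import datetime
from itertools import count

import pandas as pd
from bs4 import BeautifulSoup

# Directory the CSV files are written to (illustrative value).
RESULT_DIRECTORY = 'results'

# Maps from abbreviated region names to official ones
# (illustrative entries; the real dicts cover every sido/gungu).
sido_dict = {'서울': '서울특별시', '서울시': '서울특별시', '강원': '강원도'}
gungu_dict = {'영동': '영동군'}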
Example #2
def crawling_bbq(
        err=lambda e: print('%s : %s' % (e, datetime.now()), file=sys.stderr)):
    results = []

    url = 'https://www.bbq.co.kr/page/order/store-search_left.asp?lat=37.491872&lng=127.115922&schval=%s' % (
        urllib.parse.quote('점'))
    html = crawling(url=url)

    try:
        bs = BeautifulSoup(html, 'html.parser')
        tags_div = bs.findAll('div', attrs={'class': 'storeNearyByItem-title'})
        items = bs.findAll('div', attrs={'class': 'storeNearyByItem-address'})

        for i in range(len(tags_div)):
            name = tags_div[i].find('span').text
            address = items[i].text.strip()
            sido = address.split()[0]
            gungu = address.split()[1]
            results.append((name, address, sido, gungu))
    except AttributeError as e:
        err(e)

    table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gungu'])
    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))
    table.to_csv('{0}/bbq_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8',
                 mode='w',
                 index=True)
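Example #2 calls a crawling() helper (imported elsewhere as cw.crawling) that fetches a URL and returns its body, or None on failure, which is what the kyochon examples later use as their end-of-pages signal. A minimal sketch of such a helper, assuming a plain urllib GET:

import sys
import urllib.request
from datetime import datetime


def crawling(url, encoding='utf-8'):
    # Fetch the page and return the decoded body; return None on any error
    # so callers can treat a failed request as "no more pages".
    try:
        with urllib.request.urlopen(url) as resp:
            html = resp.read().decode(encoding, errors='replace')
        print('%s: success for request [%s]' % (datetime.now(), url))
        return html
    except Exception as e:
        print('%s: error for request [%s] (%s)' % (datetime.now(), url, e), file=sys.stderr)
        return None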
Example #3
def crawling_pelicana():
    results = []
    for page in count(start=1):
        url = 'http://www.pelicana.co.kr/store/stroe_search.html?gu=&si=&page=%d' % page
        html = cw.crawling(url=url)

        bs = BeautifulSoup(html, 'html.parser')
        tag_table = bs.find('table', attrs={'class': 'table mt20'})
        tag_tbody = tag_table.find('tbody')
        tags_tr = tag_tbody.findAll('tr')

        # detect the end (no more rows)
        if len(tags_tr) == 0:
            break

        for tag_tr in tags_tr:
            strings = list(tag_tr.strings)

            name = strings[1]
            address = strings[3]
            sidogu = address.split()[:2]

            results.append((name, address) + tuple(sidogu))

    #store
    table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gungu'])

    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))

    table.to_csv('{0}/pelicana_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8',
                 mode='w',
                 index=True)
Example #4
def crawling_kyochon():
    results = []
    for sido in range(1, 18):
        for sido2 in count(start=1):
            url = 'http://www.kyochon.com/shop/domestic.asp?sido1=%d&sido2=%d' % (sido, sido2)
            html = cw.crawling(url=url)
            if html is None:
                break

            bs = BeautifulSoup(html, 'html.parser')
            tag_li = bs.find('div', attrs={'class': 'shopSchList'})
            tags_dl = tag_li.findAll('dl')

            for tag_dl in tags_dl:
                strings = list(tag_dl.strings)
                if strings[0] == '검색결과가 없습니다.':
                    break
                else:
                    name = strings[1]
                address = strings[3].strip()
                sidogu = address.split()[:2]

                results.append((name, address) + tuple(sidogu))

        # store
        table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gungu'])

        table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
        table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))

        table.to_csv('{0}/kyochon_table.csv'.format(RESULT_DIRECTORY), encoding='utf-8', mode='w', index=True)
Example #5
def crawling_goobne():
    url = 'http://www.goobne.co.kr/store/search_store.jsp'

    # load the first page
    wd = webdriver.Chrome(
        r'D:\PycharmProjects\chromedriver_win32\chromedriver.exe')
    wd.get(url)
    time.sleep(5)
    # print(wd.page_source)

    results = []
    for page in count(start=1):
        # execute JavaScript
        script = 'store.getList(%d)' % page
        wd.execute_script(script)  # run the script
        print('%s : success for script execute [%s]' %
              (datetime.now(), script))
        time.sleep(5)

        # fetch the rendered HTML after the script has run
        html = wd.page_source

        # parsing with bs4
        bs = BeautifulSoup(html, 'html.parser')
        tag_tbody = bs.find('tbody', attrs={'id': 'store_list'})
        tags_tr = tag_tbody.findAll('tr')  # findAll returns a list of rows
        # print(tag_tbody)

        # detect the last page
        if tags_tr[0].get('class') is None:
            break

        for tag_tr in tags_tr:
            strings = list(tag_tr.strings)
            # print(strings)
            name = strings[1]
            address = strings[6]
            sidogu = address.split()[:2]  # slice sido and gungu out of the address

            results.append((name, address) + tuple(sidogu))

        print(results)

    #store
    table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gungu'])

    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))  # map to the normalized value
    table['gungu'] = table.gungu.apply(
        lambda v: gungu_dict.get(v, v))  # map to the normalized value

    table.to_csv('{0}/goobne_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8',
                 mode='w',
                 index=True)
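The Selenium snippets here pass the chromedriver path positionally to webdriver.Chrome(), which only works with Selenium 3; Selenium 4 expects the path to be wrapped in a Service object (or resolved automatically by Selenium Manager). A sketch of the equivalent setup under Selenium 4, reusing the same illustrative driver path:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# Selenium 4 style: wrap the driver path in a Service object.
service = Service(r'D:\PycharmProjects\chromedriver_win32\chromedriver.exe')
wd = webdriver.Chrome(service=service)
# From here the snippet proceeds as above: wd.get(url), execute_script(), page_source.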
Example #6
def crawling_cu():
    url = 'http://cu.bgfretail.com/store/list.do?category=store'

    wd = webdriver.Chrome('D:/bigdata/chromedriver/chromedriver.exe')
    wd.get(url)
    # time.sleep(2)

    results = []
    for page in range(1, 201):
        script = 'newsPage(%d)' % page
        wd.execute_script(script)  # execute the script
        print('%s : success for script execute [%s]' % (datetime.now(), script))
        time.sleep(1)

        html = wd.page_source
        # print(html)

        bs = BeautifulSoup(html, 'html.parser')
        tag_div = bs.find('div', attrs={'class':'detail_store'})
        tag_tbody = tag_div.find('tbody')
        # print("tag_tbody === ", tag_tbody)
        tags_tr = tag_tbody.findAll('tr')
        # print("tag_tr === ", tags_tr)

        # detect the last page
        if not tags_tr:
            print("reached the end!")
            break

        for tag_tr in tags_tr:
            strings = list(tag_tr.strings)
            # print("strings === ",strings)
            name = strings[2]
            phone = strings[4]
            address = strings[10]
            sidogu = address.split()[:2]

            results.append((name, address, phone) + tuple(sidogu))

    print(results)

    # store
    table = pd.DataFrame(results, columns=['name', 'address','phone', 'sido', 'gungu'])

    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))

    table.to_csv(  # save to file
        '{0}/cu_table.csv'.format(RESULT_DIRECTORY),
        encoding='utf-8',
        mode='w',
        index=True)
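The fixed time.sleep() calls above only guess how long the newsPage() script needs; an explicit wait that polls for the rendered store table is usually more reliable. A sketch using Selenium's WebDriverWait against the same CU page (presence alone does not prove the table was re-rendered for the new page, so a stricter condition such as staleness of the old rows may still be needed):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

wd = webdriver.Chrome()  # recent Selenium versions can resolve the driver automatically
wd.get('http://cu.bgfretail.com/store/list.do?category=store')

wd.execute_script('newsPage(2)')
# Wait up to 10 seconds for the store table instead of sleeping a fixed amount.
WebDriverWait(wd, 10).until(
    EC.presence_of_element_located((By.CLASS_NAME, 'detail_store')))
html = wd.page_source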
Example #7
def crawling_kyochon():

    results = []
    for sido1 in range(1,18):
    #for sido1 in range(1, 5):
        for sido2 in count(start=1):
        #for sido2 in range(2,20):
            url = 'http://www.kyochon.com/shop/domestic.asp?sido1=%d&sido2=%d&txtsearch=' % (sido1,sido2)
            html = cw.crawling(url=url)

            if html is None:
                break

            bs = BeautifulSoup(html, 'html.parser')
            tag_ul = bs.find('ul', attrs={'class':'list'})
            tag_li = tag_ul.find('li')
            tag_a = tag_li.find('a')

            tag_dl = tag_a.findAll('dl')
            #print('%s : sucess for script execute [%s]' % (datetime.now(), tag_dl))


            for dl in tag_dl:
                strings = list(dl.strings)

                #print(strings.strip())
                try:
                    name = strings[1]
                    #print(name)
                    address = strings[3].strip()
                    sidogu = address.split()[:2]
                    results.append((name, address) + tuple(sidogu))
                except Exception:
                    # skip entries that fail to parse
                    continue


        #print(results)

        #store

        table = pd.DataFrame(results, columns=['name','address','sido','gungu'])
        table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
        table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))
        print(table)



        table.to_csv(
            '{0}/kyochon_table.csv'.format(RESULT_DIRECTORY),
            encoding='utf-8',
            mode='w',
            index=True)
Example #8
def crawling_goobne():
    results = []
    url = 'http://www.goobne.co.kr/store/search_store.jsp'
    # load the first page
    wd = webdriver.Chrome(
        'C:/Users/minkyu/Desktop/코딩프로그램/chromedriver.exe')  # launch chromedriver
    wd.get(url)  # fetch the URL with a GET request (read it as-is; POST would be for modification)

    time.sleep(5)  # wait 5 seconds

    for page in count(start=1):  # count(start=1) iterates from 1 without end
        script = 'store.getList(%d)' % page  # the page number changes each iteration
        print('%s : success for script execute [%s]' %
              (datetime.now(), script))
        wd.execute_script(script)  # execute the script
        time.sleep(5)  # wait 5 seconds

        # fetch the rendered HTML
        html = wd.page_source
        #print(html)

        # parsing with bs4 (pull out the data we need)
        bs = BeautifulSoup(html, 'html.parser')  # invoke the HTML parser
        tag_tbody = bs.find('tbody',
                            attrs={'id': 'store_list'})  # the tbody whose id is store_list
        tags_tr = tag_tbody.findAll('tr')  # every tr inside the tbody
        # detect the last page
        if tags_tr[0].get('class') is None:  # stop when the first tr has no class attribute
            break

        for tag_tr in tags_tr:
            strings = list(tag_tr.strings)  # collect every string inside the tag into a list
            name = strings[1]  # branch name
            address = strings[6]  # address
            sidogu = address.split(' ')[0:2]  # split the address and keep sido/gungu

            results.append((name, address) + tuple(sidogu))
    print(results)

    #store
    table = pd.DataFrame(results, columns=['name', 'address', 'sido',
                                           'gungu'])  # build the DataFrame (table)
    table['sido'] = table.sido.apply(
        lambda v: sido_dict.get(v, v))  # if the key is missing from sido_dict, keep the value as-is
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))

    table.to_csv(
        '{0}/goobne_table.csv'.format(RESULT_DIRECTORY),  # save as CSV in the result directory
        encoding='utf-8',
        mode='w',
        index=True)
Example #9
def crawling_goobne(
        err=lambda e: print('%s : %s' % (e, datetime.now()), file=sys.stderr)):
    results = []
    url = 'https://www.goobne.co.kr/store/search_store.jsp'

    # load the first page
    browser = webdriver.Chrome(r'D:\pythonPycharm\chromedriver')
    browser.get(url)
    # wait page loading...
    time.sleep(3)

    for page in count(start=1):
        # execute JavaScript
        script = 'store.getList(%d)' % page
        browser.execute_script(script)
        time.sleep(1)
        html = browser.page_source

        try:
            bs = BeautifulSoup(html, 'html.parser')

            tag_tbody = bs.find('tbody', attrs={'id': 'store_list'})
            tags_tr = tag_tbody.findAll('tr')

            # last page
            if len(tags_tr) == 1:
                break
            for tag_tr in tags_tr:
                # # raises an error when the phone number is missing
                # strings = list(tag_tr.strings)
                # name = strings[1]
                # address = strings[6]
                # sido = address.split()[0]
                # gungu = address.split()[1]
                # print(name, address, sido, gungu)
                name = tag_tr.find('td').text
                address = tag_tr.find('td', attrs={
                    'class': 't_left'
                }).text.strip()[:-15]
                sido = address.split()[0]
                gungu = address.split()[1]
                results.append((name, address, sido, gungu))
        except AttributeError as e:
            err(e)
        table = pd.DataFrame(results,
                             columns=['name', 'address', 'sido', 'gungu'])
        table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
        table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))
        table.to_csv('{0}/goobne_table.csv'.format(RESULT_DIRECTORY),
                     encoding='utf-8',
                     mode='w',
                     index=True)
Example #10
def crawling_goobne():
    url = 'http://www.goobne.co.kr/store/search_store.jsp'

    # load the first page
    wd = webdriver.Chrome('D:/bigdata/chromedriver/chromedriver.exe')
    wd.get(url)
    time.sleep(5)

    results = []
    for page in count(start=1):
        # execute JavaScript
        # <a href="javascript:store.getList('3');">3</a> -- the link runs a script rather than loading a URL
        script = 'store.getList(%d)' % page
        wd.execute_script(script)   # execute the script
        print('%s : success for script execute [%s]' % (datetime.now(), script))
        time.sleep(5)

        # fetch the rendered HTML
        html = wd.page_source  # grab the page source

        # parsing with bs4
        bs = BeautifulSoup(html, 'html.parser')
        tag_tbody = bs.find('tbody', attrs={'id': 'store_list'})
        tags_tr = tag_tbody.findAll('tr')

        # detect the last page
        if tags_tr[0].get('class') is None:  # rows on a real page look like <tr class="on">; no class marks the end
            break

        for tag_tr in tags_tr:
            strings = list(tag_tr.strings)
            name = strings[1]
            address = strings[6]
            sidogu = address.split()[:2]

            results.append((name, address) + tuple(sidogu))

    print(results)

    # store
    table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gungu'])

    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))  # get(v, v) returns the mapped value if v is a key, otherwise v itself
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))

    table.to_csv(  # save to file
        '{0}/goobne_table.csv'.format(RESULT_DIRECTORY),
        encoding='utf-8',
        mode='w',
        index=True)
Example #11
def crawling_kyochon():
    result = []

    for sido1 in range(1, 18):
        for sido2 in count(start=1):
            url = 'http://www.kyochon.com/shop/domestic.asp?sido1=%d&sido2=%d&txtsearch=' % (
                sido1, sido2)
            html = cw.crawling(url=url)

            if html is None:
                break

            bs = BeautifulSoup(html, 'html.parser')
            tag_ul = bs.find('ul', attrs={'class': 'list'})

            for tag_a in tag_ul.findAll('a'):
                tag_dt = tag_a.find('dt')
                if tag_dt is None:
                    break

                name = tag_dt.get_text()

                tag_dd = tag_a.find('dd')
                if tag_dd is None:
                    break

                address = tag_dd.get_text().strip().split('\r')[0]
                sidogu = address.split()[:2]
                result.append((name, address) + tuple(sidogu))

    table = pd.DataFrame(result, columns=['name', 'address', 'sido', 'gungu'])

    # remove duplicates
    table = table.\
        drop_duplicates(subset='name', keep='first').\
        reset_index(drop=True)

    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))

    table = table.drop_duplicates(subset='name', keep='first').\
        reset_index(drop=True).\
        reset_index().\
        set_index('index')

    table.to_csv('{0}/kyochon_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8',
                 mode='w',
                 index=True)
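Example #11 deduplicates branches by name and then rebuilds a clean 0-based index. A toy illustration of what the drop_duplicates/reset_index chain does, using made-up rows:

import pandas as pd

toy = pd.DataFrame(
    [('황간점', '충청북도 영동군 황간면', '충청북도', '영동군'),
     ('황간점', '충청북도 영동군 황간면', '충청북도', '영동군'),   # duplicate name, dropped
     ('석촌점', '서울특별시 송파구', '서울특별시', '송파구')],
    columns=['name', 'address', 'sido', 'gungu'])

deduped = toy.drop_duplicates(subset='name', keep='first').reset_index(drop=True)
print(len(deduped))  # 2 - only the first '황간점' row survives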
Example #12
def crawling_pelicana():
    results = []
    for page in count(start=1):
        url = 'http://www.pelicana.co.kr/store/stroe_search.html?gu=&si=&page=%d' % (
            page)
        html = cw.crawling(url=url)
        # print(url)

        bs = BeautifulSoup(html, 'html.parser')

        tag_table = bs.find('table', attrs={'class': 'table mt20'})
        # print(tag_table)
        tag_tbody = tag_table.find('tbody')
        tags_tr = tag_tbody.findAll('tr')
        # print(tags_tr)
        # detect the last page
        if len(tags_tr) == 0:
            break

        for tag_tr in tags_tr:
            strings = list(tag_tr.strings)
            # print(strings)
            name = strings[1]
            # print(name)
            address = strings[3]
            # print(address.split())
            sidogu = address.split()[:2]  # use slicing
            # print(sidogu)

            results.append((name, address) + tuple(sidogu))  # better to append a flat tuple
            # merging the two tuples keeps sidogu from ending up as a nested list/tuple
            print(results)
        # print(page + ":" + len(tags_tr), sep=':')
    # proc: all of the data is handled here, so a separate processing step is not used
    # print(results)
    # logging
    # print('%s: success for request [%s]' % (datetime.now(), url))

    #store
    table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gungu'])
    # print(table)

    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))  # normalization applied here as well
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))

    table.to_csv('{0}/pelicana_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8',
                 mode='w',
                 index=True)
Example #13
def store_nene(data):
    table = pd.DataFrame(data, columns=['name', 'address', 'sido', 'gungu'])

    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))

    table = table.drop_duplicates(subset='name', keep='first').\
        reset_index(drop=True).\
        reset_index().\
        set_index('index')

    table.to_csv('{0}/nene_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8',
                 mode='w',
                 index=True)
Example #14
def crawling_kyochon():
    results = []

    while True:
        for sido1 in range(1, 18):
            for sido2 in count(start=1):
                url = 'http://www.kyochon.com/shop/domestic.asp?sido1=%d&sido2=%d' % (
                    sido1, sido2)
                html = cw.crawling(url=url)
                if html is None:
                    break

                bs = BeautifulSoup(html, 'html.parser')

                # tag_table = bs.find('div', attrs={'class': 'shopSchList'})
                # tag_tbody = tag_table.find('ul', attrs={'class': 'list'})
                # tags_tr = tag_tbody.findAll('li')
                tag_table = bs.find('ul', attrs={'class': 'list'})
                tags_tr = tag_table.findAll('li')

                for tag_tr in tags_tr:
                    strings = list(tag_tr.strings)
                    if '검색결과가 없습니다.' not in strings:
                        name = strings[3]

                        # address = strings[5].replace('\t', '').replace('\r', '').replace('\n', '')
                        temp_address = strings[5]
                        print(temp_address)
                        address = ','.join(temp_address.split()).replace(
                            ',', ' ')

                        sido = address.split()[:2]

                        results.append((name, address) + tuple(sido))

            # store
            table = pd.DataFrame(results,
                                 columns=['name', 'address', 'sido', 'gungu'])

            table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
            table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))
            table.to_csv('{0}/kyochon_table.csv'.format(RESULT_DIRECTORY),
                         encoding='utf-8',
                         mode='w',
                         index=True)

        if sido1 == 17:
            break
Example #15
def crawling_kyochon():

    sido1 = [  # province names; shadowed by the numeric loop variable below
        '서울', '부산', '대구', '인천', '광주', '대전', '울산', '세종', '경기', '강원', '충북', '충남',
        '전북', '전남', '경북', '경남', '제주'
    ]
    results = []
    for sido1 in range(1, 18):
        # for sido2 in count(start=1):
        for sido2 in count(start=1):
            url = 'http://www.kyochon.com/shop/domestic.asp?sido1=%d&sido2=%d' % (
                sido1, sido2)
            html = cw.crawling(url=url)

            if html is None:
                break
            else:
                try:
                    bs = BeautifulSoup(html, 'html.parser')
                    # div-ul-li-<dl><dt><dd>
                    tag_div = bs.find('div', attrs={'class': 'shopSchList'})
                    tag_ul = tag_div.find('ul')
                    tags_li = tag_ul.findAll('li')

                    for tag_li in tags_li:
                        strings = list(tag_li.strings)

                        name = strings[3]
                        address = str(strings[5]).strip()
                        sidogu = address.split()[:2]
                        results.append((name, address) + tuple(sidogu))  # tuple is immutable
                        table = pd.DataFrame(
                            results,
                            columns=['name', 'address', 'sido', 'gungu'])
                        table['sido'] = table.sido.apply(
                            lambda v: sido_dict.get(v, v))
                        table['gungu'] = table.gungu.apply(
                            lambda v: gungu_dict.get(v, v))

                        # store
                        table.to_csv(
                            '{0}/kyochon_table.csv'.format(RESULT_DIRECTORY),
                            encoding='utf-8',
                            mode='w',
                            index=True)
                except Exception as e:
                    print(e)
                    pass
Example #16
def crawling_pelicana():
    results = []
    for page in count(start=1):  # counts up from 1; the break condition below is required
        url = 'http://www.pelicana.co.kr/store/stroe_search.html?page=%d&branch_name=&gu=&si=' % page
        html = cw.crawling(url=url)
        # print("html result === ", html)

        bs = BeautifulSoup(html, 'html.parser')
        # print("beautiful === ", bs)

        tag_table = bs.find('table', attrs={"class": 'table mt20'})  # grab the <table class="table mt20"> element through its closing </table>
        # print("tag_table === ",tag_table)
        tag_tbody = tag_table.find('tbody')  # grab the <tbody> element
        # print("tag_tbody === ", tag_tbody)
        tags_tr = tag_tbody.findAll('tr')  # grab every <tr> row
        # print("tags_tr === ", tags_tr)

        # detect the end
        if len(tags_tr) == 0:
            break

        for tag_tr in tags_tr:
            strings = list(tag_tr.strings)
            print("strings === ",strings)

            name = strings[1]
            address = strings[3]
            sidogu = address.split()[:2]
            # print("sidogu === ", sidogu)

            results.append( (name, address) + tuple(sidogu))


    # store
    table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gungu'])
    # print("table === ", table)

    table['sido'] = table.sido.apply(lambda v : sido_dict.get(v, v))
    # print("table['sido'] === ", table['sido'])
    table['gungu'] = table.gungu.apply(lambda  v: gungu_dict.get(v, v))
    # print("table['gungu'] === ", table['gungu'])

    table.to_csv(  # save to file
        '{0}/pelicana_table.csv'.format(RESULT_DIRECTORY),
        encoding='utf-8',
        mode='w',
        index=True)
Example #17
def crawling_kyochon():
    results = []

    for sido1 in count(start=1):
        for sido2 in count(start=1):
            try:
                # if sido2 is not None:
                #     break
                url = 'http://www.kyochon.com/shop/domestic.asp?sido1=%d&sido2=%d&txtsearch=' % (
                    sido1, sido2)
                html = cw.crawling(url=url)
                # if html is None:
                #     break
                bs = BeautifulSoup(html, 'html.parser')
                tag_table = bs.find('ul', attrs={'class': 'list'})
                tag_li = tag_table.find('li')
                print(tag_li)
                tag_dl = tag_li.findAll('dl')
                print(tag_dl)
                # tag_dt = tag_dl.find('dt')
                # tags_dd = tag_dl.findAll('dd')
                for a in tag_dl:
                    strings = list(a.strings)
                    print(strings)
                    # print(strings)
                    name = strings[1]
                    address = strings[3].strip()
                    sidogu = address.split()[:2]
                    results.append((name, address) + tuple(sidogu))
            # end detection: any request/parse failure means there are no more pages
            except Exception:
                break

    # print(results)

    # proc
    # print(results)
    # store
    table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gungu'])

    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))

    table.to_csv('{0}/kyochon_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8',
                 mode='w',
                 index=True)
Example #18
def crawling_pericana():
    results = []
    #for page in range(1,3):
    for page in count(start=1):
         url = 'http://www.pelicana.co.kr/store/stroe_search.html?gu=&si=&page=%d' % page
         html = cw.crawling(url=url)
         #print(html)

         bs = BeautifulSoup(html, 'html.parser')
         tag_table = bs.find('table', attrs={'class': 'table mt20'})
         tag_tbody = tag_table.find('tbody')
         tags_tr = tag_tbody.findAll('tr')
         #print(type(tags_tr),tags_tr) #type : <class 'bs4.element.ResultSet'>
         #print(len(tags_tr),tags_tr)


         # detect the end
         if len(tags_tr) == 0:
             break

         #print(page, ":", len(tags_tr), sep=":")

         for tag_tr in tags_tr:
             strings = list(tag_tr.strings)
             name = strings[1]
             address = strings[3]
             #print(address.split())
             sidogu = address.split()[:2]

             results.append((name, address) + tuple(sidogu))  # build a tuple

             #print(results)

    #store
    table = pd.DataFrame(results, columns=['name','address','sido','gungu'])
    #print(table)
    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))  # e.g. 서울 -> 서울특별시
    # -> look the existing sido value up in sido_dict; use the mapping if present, otherwise keep the value
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v,v))
    print(table)

    table.to_csv(
        '{0}/pelicana_table.csv'.format(RESULT_DIRECTORY),
         encoding='utf-8',
         mode='w',
         index=True)
Example #19
def store_nene(data):

    table = pd.DataFrame(data, columns=['name', 'address', 'sido', 'gungu'])
    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))
    table.to_csv('{0}/nene_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8',
                 mode='w',
                 index=True)
    # unreachable leftover from a crawling routine; `html` is not defined in this function
    # bs = BeautifulSoup(html, 'html.parser')
    # tag_table = bs.find('table', attrs={'class': 'table mt20'})
    # tag_tbody = tag_table.find('tbody')
    # tags_tr = tag_tbody.findAll('tr')
Example #20
def crawling_pelicana():
    results = []
    #  page keeps increasing from 1... break inside the loop
    for page in count(start=1):
        print(page, ":", end=" ")
        url = 'http://pelicana.co.kr/store/stroe_search.html?page=' + str(
            page) + '&branch_name=&gu=&si='
        html = crawling(url=url)
        bs = BeautifulSoup(html, 'html.parser')

        tag_table = bs.find('table', attrs={"class": "table mt20"})
        tag_tbody = tag_table.find('tbody')
        tags_tr = tag_tbody.findAll('tr')

        # last page
        if len(tags_tr) == 0:
            break

        # convert to tuples
        for tag_tr in tags_tr:
            strs = list(tag_tr.strings)
            name = strs[1]
            address = strs[3]
            sidogu = address.split(" ")[:2]

            # concatenating the tuples yields results like:
            # [('황간점', '충청북도 영동군 황간면 남성리 558-1', '충청북도', '영동군'), ...]
            results.append((name, address) + tuple(sidogu))

    # store
    table = pd.DataFrame(results, columns=['name', 'address', 'sido',
                                           'gungu'])  # mind the column order

    # sido_dict is a dictionary like {'서울시': '서울특별시', '서울': '서울특별시', '강원': '강원도', ... }
    # dict.get(key) returns the value stored under that key
    # sido_dict.get('서울시') returns '서울특별시', because '서울시' is a key whose value is '서울특별시'.
    # get(v, v) --> if v is not a key, v itself is returned.
    # sido_dict.get(v, v) is applied by passing an anonymous function to Series.apply().
    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))

    table.to_csv('{0}/pelicana_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8',
                 mode='w',
                 index=True)
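The comments above describe the dict.get(v, v) idiom; a tiny self-contained illustration of the same normalization through Series.apply (dictionary entries are illustrative):

import pandas as pd

sido_dict = {'서울시': '서울특별시', '서울': '서울특별시', '강원': '강원도'}

sido = pd.Series(['서울', '강원', '충청북도'])
normalized = sido.apply(lambda v: sido_dict.get(v, v))
print(list(normalized))  # ['서울특별시', '강원도', '충청북도'] - unknown values pass through unchanged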
Example #21
def crawling_pelicana():
    results = []
    for page in count(start=1):  # itertools.count
        url = 'http://www.pelicana.co.kr/store/stroe_search.html?page=%d&branch_name=&gu=&si=' % (
            page)
        print(page)
        html = cw.crawling(url=url)

        bs = BeautifulSoup(html, 'html.parser')  # HTML parser
        tag_table = bs.find('table',
                            attrs={'class':
                                   'table mt20'})  # find the table whose class is "table mt20"
        print(tag_table)
        tag_tbody = tag_table.find('tbody')
        tags_tr = tag_tbody.findAll('tr')
        print(tags_tr)
        # detect the end
        if len(tags_tr) == 0:
            break

        for tag_tr in tags_tr:
            strings = list(tag_tr.strings)  # list of every string in the tag (including newlines and tabs)
            print(strings)
            name = strings[1]  # index of the branch name
            address = strings[3]  # index of the address
            #print(address.split())  # split the address into a list
            sidogu = address.split()[:2]  # take only the first two tokens by slicing

            results.append((name, address) +
                           tuple(sidogu))  # tuple of name, address, sido, gungu; tuples prevent accidental changes

    #store
    #print(results)
    table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gungu'])

    table['sido'] = table.sido.apply(lambda v: sido_dict.get(
        v, v))  # map each sido value through sido_dict; leave it unchanged if there is no mapping
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))

    table.to_csv('{0}/pelicana_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8',
                 mode='w',
                 index=True)
Example #22
def crawling_kyuchon():
    results = []
    for sido1 in range(1, 18):
        for sido2 in count(start=1):
            url = 'http://www.kyochon.com/shop/domestic.asp?sido1=%d&sido2=%d&txtsearch=' % (
                sido1, sido2)
            html = cw.crawling(url=url)
            if html == None:
                break
            bs = BeautifulSoup(html, 'html.parser')  # HTML parser
            tag_div = bs.find('div',
                              attrs={'class':
                                     'shopSchList'})  # find the div whose class is shopSchList
            # tag_ul = tag_div.find('ul')                                 # drill down step by step; grabbing everything at once parses badly
            tag_lis = tag_div.findAll('li')

            for tag_li in tag_lis:
                strings = list(
                    tag_li.strings)  # list of every string in the tag (including newlines and tabs)

                try:
                    name = strings[3]  # index of the branch name
                    address = strings[5].strip()  # index of the address
                    #print(address.split())  # split the address into a list
                    sidogu = address.split()[:2]  # take only the first two tokens by slicing
                    results.append((name, address) + tuple(sidogu))
            #print(results)
                except Exception as e:
                    print('%s : %s' % (e, datetime.now()), file=sys.stderr)

    table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gungu'])

    table['sido'] = table.sido.apply(lambda v: sido_dict.get(
        v, v))  # map each sido value through sido_dict; leave it unchanged if there is no mapping
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))

    table.to_csv(
        '{0}/kyuchon_table.csv'.format(RESULT_DIRECTORY),  # save as CSV in the result directory
        encoding='utf-8',
        mode='w',
        index=True)
Example #23
def crawling_kyochon():
    results = []
    for sido1 in range(1, 18):
        for sido2 in count(start=1):
            try:

                url = 'http://www.kyochon.com/shop/domestic.asp?sido1=%d&sido2=%d&txtsearch=' % (
                    sido1, sido2)
                html = cw.crawling(url=url)

                bs = BeautifulSoup(html, 'html.parser')
                tag_table = bs.find('div', attrs={'class': 'shopSchList'})
                # print(tag_table)
                tags_li = tag_table.findAll('li')
                # print('tag_tbody:',tag_tbody)
                #tags_dl = tag_tbody.findAll('dl')
                for tag_li in tags_li:
                    strings = list(tag_li.strings)
                    print('strings', strings)
                    name = strings[3]
                    address = strings[5]
                    address = address.strip()
                    sidogu = address.split()[:2]
                    results.append((name, address) + tuple(sidogu))

            except Exception:
                # a parse failure means the last page for this sido was passed
                break
        # store
    table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gungu'])

    # apply() feeds each value into the lambda and fills sido with the mapped result
    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))

    table.to_csv('{0}/kyonchon_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8',
                 mode='w',
                 index=True)
Example #24
def crawling_kyochon(
        err=lambda e: print('%s : %s' % (e, datetime.now()), file=sys.stderr)):
    results = []
    for sido1 in range(1, 17):
        for sido2 in count(start=1):
            print(sido1, ", ", sido2, " :", end=" ")
            url = 'http://www.kyochon.com/shop/domestic.asp?sido1=' + str(
                sido1) + '&sido2=' + str(sido2) + '&txtsearch='
            html = crawling(url=url)

            try:
                bs = BeautifulSoup(html, 'html.parser')
                tag_div = bs.find('div', attrs={'class': 'shopSchList'})
                tags_dl = tag_div.findAll('dl')
                # last page
                if len(tags_dl) == 0:
                    break

                for tag_dl in tags_dl:
                    name = tag_dl.find('dt').text
                    address = tag_dl.find('dd').text.strip().replace(
                        "\t", "").split("\r\n")[0]
                    sido = address.split()[0]
                    gungu = address.split()[1]

                    results.append((name, address, sido, gungu))
            except AttributeError as e:
                err(e)

    table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gungu'])
    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))
    table.to_csv('{0}/kyochon_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8',
                 mode='w',
                 index=True)
Example #25
def crawling_kyochon():
    results = []
    for sido1 in range(1, 18):
        for sido2 in count(start=1):
            url = 'http://www.kyochon.com/shop/domestic.asp?sido1=%d&sido2=%d&txtsearch=' %(sido1, sido2)
            html = cw.crawling(url=url)
            if html is None:
                break
            bs = BeautifulSoup(html, 'html.parser')
            tag_li = bs.find('div', attrs={'class':"shopSchList"})
            tag_ul = tag_li.find('ul')
            tags_li = tag_ul.findAll('li')
            for tag_li in tags_li:
                strings = list(tag_li.strings)
                if strings[0] == '검색결과가 없습니다.':
                    break
                name = strings[3]
                address = strings[6].strip().replace('(','').replace(')','').replace(' ', '')
                sidogu = strings[5].split()[:2]
                results.append((name, address) + tuple(sidogu))
    table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gungu'])
    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))
    table.to_csv('{0}/kyochon_table2.csv'.format(RESULT_DIRECTORY), encoding='utf-8', mode='w', index=True)
Example #26
def crawling_kyochon():
    results = []

    for sido1 in range(1, 18):
        for sido2 in count(start=1):
            print("sido1 === ", sido1)
            print("sido2 === ", sido2)
            url = 'http://www.kyochon.com/shop/domestic.asp?sido1=%d&sido2=%d&txtsearch=' % (sido1, sido2)
            # print("url===",url)
            html = cw.crawling(url=url)
            # print("html === ",html)

            if html is None:
                print("no HTML returned")
                break

            bs = BeautifulSoup(html, 'html.parser')
            # print("bs === ", bs)

            tag_div = bs.find('div', attrs={"class" : "shopSchList"})
            # print("tag_table === ", tag_div)
            tag_ul = tag_div.find('ul', attrs={"class" : "list"})
            # print("tag_ul === ", tag_ul)
            # tag_li = tag_ul.find('li')
            # print("tag_li === ", tag_li)
            tag_dl = tag_ul.findAll('dl')
            # print("tag_dl === ", tag_dl)
            # tag_dt = tag_dl.find('dt')
            # print("tag_dt === ", tag_dt)
            # tag_dd = tag_dl.findAll('dd')
            # print("tag_dd === ", tag_dd)

            for dl in tag_dl:
                # print("dl ==== loop ",dl)
                try:
                    strings = list(dl.strings)
                    print("strings === ", strings)

                    print("strings[1] =====", strings[1])
                    name = strings[1] + "점"
                    address = strings[3]
                    # print(strings[3])
                    address_after = re.sub(r"[\r\n\t]", "", address)  # strip carriage returns, newlines and tabs
                    # print("address_after === ",address_after)
                    address_strip = address_after.strip()
                    print("address_after.strip() === ", address_strip)
                    sidogu = address.split()[:2]
                    print("sidogu === ", sidogu)

                    results.append((name, address_strip) + tuple(sidogu))

                    print("results === ", results)
                except Exception as e:
                    print("오류 === ", e)
                    continue



    table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gungu'])
    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))

    table.to_csv(  # save to file
        '{0}/kyochon_table.csv'.format(RESULT_DIRECTORY),
        encoding='utf-8',
        mode='w',
        index=True)
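Example #26 removes \r, \n and \t from the address with a regex character class. Whitespace can also be normalized by splitting and re-joining, which is effectively what the other kyochon examples rely on via address.split(). A small sketch with a made-up address:

import re

raw = '서울특별시\r\n\t송파구  올림픽로  300'  # made-up address with stray whitespace

cleaned_re = re.sub(r'[\r\n\t]', ' ', raw)   # regex: replace control characters with spaces
cleaned = ' '.join(raw.split())              # split/join: collapse every whitespace run
print(cleaned)  # 서울특별시 송파구 올림픽로 300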