# Shared imports for the snippets below. `crawler` / `cw` (HTTP fetch helpers),
# `sido_dict` / `gungu_dict` (region-name maps) and RESULT_DIRECTORY are
# project-local names the original examples rely on.
from itertools import count
from datetime import datetime
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver


def crawling_kyochon():
    result = []

    for sido1 in range(1, 18):
        for sido2 in count(start=1):
            url = 'http://www.kyochon.com/shop/domestic.asp?txtsearch=&sido1=%d&sido2=%d' % (
                sido1, sido2)
            html = crawler.crawling(url=url)

            if html is None:
                break

            bs = BeautifulSoup(html, 'html.parser')
            tag_ul = bs.find('ul', attrs={'class': 'list'})

            for tag_a in tag_ul.findAll('a', href=True):
                name = tag_a.find('dt').get_text()
                address = tag_a.find('dd').get_text().strip().split('\r')[0]
                sidogu = address.split()[:2]
                result.append((name, address) + tuple(sidogu))

    # store
    table = pd.DataFrame(result, columns=['name', 'address', 'sido', 'gungu'])

    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))

    # remove duplicates by store name and renumber the index
    table = (table
             .drop_duplicates(subset='name', keep='first')
             .reset_index(drop=True))
    table.to_csv('{0}/kyochon_table.csv'.format(RESULT_DIRECTORY),
                 encoding="utf-8",
                 mode='w',
                 index=True)
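

# The snippets in this listing call a project-local `crawler.crawling` (or
# `cw.crawling`) fetch helper that is not shown here. Judging by the
# `html is None` check above, it returns the page source on success and None
# on a failed request, which is how the inner sido2 loop detects the last
# page. A minimal sketch of such a helper, standard library only; the
# signature and encoding parameter are assumptions, not the original
# implementation.
from urllib.request import urlopen
from urllib.error import HTTPError, URLError


def crawling(url, encoding='utf-8'):
    try:
        with urlopen(url) as response:  # plain GET
            return response.read().decode(encoding)
    except (HTTPError, URLError):
        return None  # callers treat None as "no more pages"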


def store_kyochon(data):

    table = pd.DataFrame(data, columns=['name', 'address', 'sido', 'gungu'])

    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))

    table.to_csv('{0}/kyochon_table.csv'.format(RESULT_DIRECTORY))


def store_nene(data):
    table = pd.DataFrame(data, columns=['name', 'address', 'sido', 'gungu'])

    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))

    table.to_csv('{0}/nene_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8',
                 mode='w',
                 index=True)


# Variant of store_nene that also deduplicates by store name before writing:
def store_nene(data):
    table = pd.DataFrame(data, columns=['name', 'address', 'sido', 'gungu'])

    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))

    # remove duplicates by store name and renumber the index
    table = (table
             .drop_duplicates(subset='name', keep='first')
             .reset_index(drop=True))
    table.to_csv('{0}/nene_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8',
                 mode='w',
                 index=True)
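

# `sido_dict` and `gungu_dict` are module-level lookup tables assumed by all
# of these snippets: they map variant region spellings to one canonical form,
# and the dict.get(v, v) calls fall back to the raw token when no mapping
# exists. Illustrative entries only; the real maps are not in this listing.
sido_dict = {'서울': '서울특별시', '서울시': '서울특별시', '경기': '경기도'}
gungu_dict = {'강남': '강남구'}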


def crawl_goobne():
    url = 'http://www.goobne.co.kr/store/search_store.jsp'

    wd = webdriver.Chrome('D:/Python/webdriver/chromedriver.exe')
    wd.get(url)
    time.sleep(5)

    result = []
    for page in count(start=1):
        script = 'store.getList(%d)' % page
        wd.execute_script(script)
        print('%s : success for script execution (%s)' %
              (datetime.now(), script))
        time.sleep(5)

        html = wd.page_source
        bs = BeautifulSoup(html, 'html.parser')

        tag_tbody = bs.find('tbody', attrs={'id': 'store_list'})
        tags_tr = tag_tbody.findAll('tr')

        # stop when the first row has no class attribute (no more store rows)
        if tags_tr[0].get('class') is None:
            break

        for tag_tr in tags_tr:
            strings = list(tag_tr.strings)

            name = strings[1]
            address = strings[5] if strings[3] == '' else strings[6]
            sidogu = address.split()[:2]
            result.append((name, address) + tuple(sidogu))

    wd.quit()

    # store
    table = pd.DataFrame(result, columns=['name', 'address', 'sido', 'gungu'])

    # remove duplicates by store name
    table = (table
             .drop_duplicates(subset='name', keep='first')
             .reset_index(drop=True))

    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))

    table.to_csv('{0}/goobne_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8',
                 mode='w',
                 index=True)
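

# The fixed time.sleep(5) calls above trade speed for reliability; waiting
# explicitly for the store table is a sturdier alternative. A sketch using
# Selenium's standard wait API (the 'store_list' id comes from the parsing
# code above):
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


def wait_for_store_list(wd, timeout=10):
    # blocks until the tbody is present in the DOM, or raises TimeoutException
    WebDriverWait(wd, timeout).until(
        EC.presence_of_element_located((By.ID, 'store_list')))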


def crawling_pelicana():
    results = []
    RESULT_DIRECTORY = '__result__'
    for page in count(start=1):
        url = 'http://pelicana.co.kr/store/stroe_search.html?gu=&si=&page={0}'.format(
            page)
        html = cw.crawling(url=url)

        bs = BeautifulSoup(html, 'html.parser')
        tag_table = bs.find('table', attrs={'class': 'table mt20'})
        tag_tbody = tag_table.find('tbody')
        tags_tr = tag_tbody.findAll('tr')

        print(page, len(tags_tr), sep=':')

        # stop at the first page with no result rows
        if len(tags_tr) == 0:
            break

        for tag_tr in tags_tr:
            strings = list(tag_tr.strings)
            name = strings[1]
            address = strings[3]
            sidogu = address.split()[:2]

            results.append((name, address) + tuple(sidogu))

    # store
    table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gungu'])
    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))
    table.to_csv('{0}/pelicana_table.csv'.format(RESULT_DIRECTORY))
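

# A minimal driver for the snippet above; assumes `cw` exposes a fetch helper
# like the sketch earlier and creates the result directory if it is missing.
if __name__ == '__main__':
    import os
    os.makedirs('__result__', exist_ok=True)
    crawling_pelicana()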


# Variant of crawling_pelicana that deduplicates and sets explicit CSV options:
def crawling_pelicana():

    result = []

    for page in count(start=1):

        url = 'http://www.pelicana.co.kr/store/stroe_search.html?page=%d' % page
        html = crawler.crawling(url)

        bs = BeautifulSoup(html, 'html.parser')
        tag_table = bs.find('table', attrs={'class': 'table mt20'})

        tag_tbody = tag_table.find('tbody')
        tags_tr = tag_tbody.findAll('tr')

        if len(tags_tr) == 0:
            break

        for tag_tr in tags_tr:
            strings = list(tag_tr.strings)

            name = strings[1]
            address = strings[3]
            sidogu = address.split()[:2]

            result.append((name, address) + tuple(sidogu))

    table = pd.DataFrame(result, columns=['name', 'address', 'sido', 'gungu'])

    # remove duplicates by store name
    table = (table
             .drop_duplicates(subset='name', keep='first')
             .reset_index(drop=True))

    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))

    table.to_csv('{0}/pelicana_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8',
                 mode='w',
                 index=True)


# Like crawling_kyochon above, but tolerant of anchors missing <dt>/<dd> tags:
def crawl_kyochon():

    result = []

    for sido1 in range(1, 18):
        for sido2 in count(start=1):
            url = 'http://www.kyochon.com/shop/domestic.asp?sido1=%d&sido2=%d&txtsearch=' % (
                sido1, sido2)
            html = crawler.crawling(url=url)

            if html is None:
                break

            bs = BeautifulSoup(html, 'html.parser')
            tag_ul = bs.find('ul', attrs={'class': 'list'})

            for tag_a in tag_ul.findAll('a'):
                tag_dt = tag_a.find('dt')
                if tag_dt is None:
                    break

                name = tag_dt.get_text()

                tag_dd = tag_a.find('dd')
                if tag_dd is None:
                    break

                address = tag_dd.get_text().strip().split('\r')[0]
                sidogu = address.split()[:2]
                result.append((name, address) + tuple(sidogu))

    table = pd.DataFrame(result, columns=['name', 'address', 'sido', 'gungu'])

    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))

    table.to_csv('{0}/kyochon_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8',
                 mode='w',
                 index=True)
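

# All of the crawlers above emit the same four-column schema, so their output
# can be inspected uniformly. A quick sketch for counting stores per region
# from any of the generated CSVs:
def load_region_counts(path):
    table = pd.read_csv(path, encoding='utf-8')
    return table.groupby(['sido', 'gungu']).size().sort_values(ascending=False)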