def getProblems(isDaily=False):  # return the crawled problems as a list of JSON-style dicts
    if isDaily:
        problems = crawling()
    else:
        problems = crawlProblem() + crawling()
    pros_ = []  # list of problem dicts
    for problem in problems:
        classify = [problem[7]]
        is_samsung = problem[8]
        is_duplicate = False
        for gpro_ in pros_:
            if int(problem[0]) == gpro_['number']:
                # merge the extra classification and Samsung flag into the existing entry
                gpro_['classify'] += classify
                gpro_['is_samsung'] |= is_samsung
                is_duplicate = True
                break
        if not is_duplicate:
            pro_ = {
                'number': int(problem[0]),                # problem number
                'subject': problem[1],                    # problem title
                'info': problem[2],                       # info tag
                'cor': int(problem[3]),                   # accepted submissions
                'total': int(problem[4]),                 # total submissions
                'ratio': float(problem[5][:-1]) / 100.0,  # acceptance ratio
                'link': problem[6],                       # link to the problem
                'classify': classify,                     # classification: DFS, BFS, ...
                'is_samsung': is_samsung                  # appeared on a Samsung exam
            }
            pros_.append(pro_)
    return pros_
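A minimal usage sketch for the function above, assuming the project's crawlProblem()/crawling() helpers are importable; it simply serializes the merged problem list to JSON, as the function's comment suggests (the output path is illustrative):

import json

# Hypothetical usage: dump the merged problem list to disk as JSON.
problems = getProblems(isDaily=True)
with open('problems.json', 'w', encoding='utf-8') as fp:
    json.dump(problems, fp, ensure_ascii=False, indent=2)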
Example #2
def crawling_pelicana():
    results = []

    for page in count(start=1):

        html = crawling(
            'https://pelicana.co.kr/store/stroe_search.html?branch_name=&gu=&si=&page=%d'
            % page)
        bs = BeautifulSoup(html, 'html.parser')

        tag_table = bs.find('table', attrs={'class': 'table mt20'})
        tag_tbody = tag_table.find('tbody')
        tags_tr = tag_tbody.findAll('tr')

        # detect the last page: stop when no rows are returned
        if len(tags_tr) == 0:
            break

        for tag_tr in tags_tr:
            strings = list(tag_tr.strings)
            name = strings[1]
            sidogu = strings[3].split()[:2]
            results.append((name, ) + tuple(sidogu))

    # store
    table = DataFrame(results, columns=['name', 'sido', 'gugun'])
    table.to_csv('results/pelicana.csv',
                 encoding='utf-8',
                 mode='w',
                 index=True)
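The store-locator examples in this collection rely on itertools.count, BeautifulSoup, pandas' DataFrame, and a project-specific crawling() helper, none of which are shown here. A minimal sketch of those dependencies, under the assumption that crawling(url) returns the page HTML or None when the request fails:

from itertools import count

import requests
from bs4 import BeautifulSoup
from pandas import DataFrame


def crawling(url):
    # Hypothetical stand-in for the project's crawling() helper:
    # fetch the page and return its HTML, or None on any failure.
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return response.text
    except requests.RequestException:
        return None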
Example #3
def crawling_kyochon():
    results = []

    for sido1 in range(1, 18):
        for sido2 in count(start=1):
            url = 'http://www.kyochon.com/shop/domestic.asp?sido1=%d&sido2=%d&txtsearch=' % (
                sido1, sido2)
            html = crawling(url)
            if html is None:
                break

            bs = BeautifulSoup(html, 'html.parser')
            tag_ul = bs.find('ul', attrs={'class': 'list'})
            tags_a = tag_ul.findAll('a')
            for tag_a in tags_a:
                tag_strong = tag_a.find('strong')
                if tag_strong is None:
                    break

                name = tag_strong.text
                strings = list(tag_a.find('em').strings)
                address = strings[0].strip('\r\n\t')
                sidogu = address.split()[:2]
                results.append((name, ) + tuple(sidogu))
    # store
    table = DataFrame(results, columns=['name', 'sido', 'gugun'])
    table.to_csv('results/kyochon.csv', encoding='utf-8', mode='w', index=True)
def crawling_kyochon():
    result = []

    for sido1 in range(1, 18):
        for sido2 in count(start=1):
            url = 'http://www.kyochon.com/shop/domestic.asp?txtsearch=&sido1=%d&sido2=%d' % (
                sido1, sido2)
            html = crawler.crawling(url=url)

            if html is None:
                break

            bs = BeautifulSoup(html, 'html.parser')
            tag_ul = bs.find('ul', attrs={'class': 'list'})

            for tag_a in tag_ul.findAll('a', href=True):
                name = tag_a.find('dt').get_text()
                address = tag_a.find('dd').get_text().strip().split('\r')[0]
                sidogu = address.split()[:2]
                result.append((name, address) + tuple(sidogu))

    # store
    table = pd.DataFrame(result, columns=['name', 'address', 'sido', 'gungu'])

    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))

    # table = table.reset_index().drop_duplicates(subset='name', keep='first').set_index('index')
    table = table.drop_duplicates(
        subset='name',
        keep='first').reset_index(drop=True).reset_index().set_index('index')
    table.to_csv('{0}/kyochon_table.csv'.format(RESULT_DIRECTORY),
                 encoding="utf-8",
                 mode='w',
                 index=True)
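The variants above that post-process the address columns also reference sido_dict, gungu_dict, and RESULT_DIRECTORY, which are defined elsewhere in that project. They appear to be lookup tables that normalize abbreviated province/district names; a hypothetical sketch with illustrative entries only:

RESULT_DIRECTORY = 'results'

# Hypothetical normalization tables: map abbreviated region names to their
# official forms; unknown values pass through unchanged via dict.get(v, v).
sido_dict = {
    '서울': '서울특별시',
    '부산': '부산광역시',
}
gungu_dict = {
    '강남': '강남구',
}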
Example #5
def crawling_nene():
    results = []
    first_shopname_prevpage = ''

    for page in count(start=1):
        html = crawling(
            'https://nenechicken.com/17_new/sub_shop01.asp?ex_select=1&ex_select2=&IndexSword=&GUBUN=A&page=%d'
            % page)
        bs = BeautifulSoup(html, 'html.parser')

        tags_div = bs.findAll('div', attrs={'class': 'shopInfo'})

        # detect the last page: stop when the first shop repeats
        shopname = tags_div[0].find('div', attrs={'class': 'shopName'}).text
        if first_shopname_prevpage == shopname:
            break

        first_shopname_prevpage = shopname

        for tag_div in tags_div:
            name = tag_div.find('div', attrs={'class': 'shopName'}).text
            address = tag_div.find('div', attrs={'class': 'shopAdd'}).text
            sidogu = address.split()[:2]
            results.append((name, ) + tuple(sidogu))

    # store
    table = DataFrame(results, columns=['name', 'sido', 'gugun'])
    table.to_csv('results/nenne.csv', encoding='utf-8', mode='w', index=True)
Example #6
def only_crawling():

    # run the crawler
    article_data = crawler.crawling()

    # store the raw crawled articles in Elasticsearch
    # store_index = input("Enter the Elasticsearch index name to store into: ")
    es.store("olympic", article_data)
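The es module used by only_crawling() is also external. A hedged sketch of a store() helper on top of the official elasticsearch client, assuming article_data is a list of dicts (the connection URL is a placeholder):

from elasticsearch import Elasticsearch

# Hypothetical helper mirroring es.store(index_name, article_data):
# index each crawled article as its own document.
_client = Elasticsearch('http://localhost:9200')


def store(index_name, articles):
    for article in articles:
        _client.index(index=index_name, document=article)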
Example #7
def get_answer(question):

    question_post = tagger.postagging(question)
    question_part = []
    for q in question_post:
        if q[1] in ("NNP", "NNG", "NNB", "NP"):
            question_part.append(q[0])

    lines = crawler.crawling(question, question_part)

    entity = {}

    for line in lines:
        #print "### " + line
        line = line.replace("["," ").replace("]"," ").replace("“", "\"").replace("”","\"").replace("*", " ").replace("’", "'").replace("‘","'").replace("(","(").replace(":",":").replace("?"," ").replace("•", " ")

        tags = tagger.postagging_grouped(line)
        for tag in tags:
            #if tag[1] in ("NNP", "NNG", "NNB", "NP"):
            try:
                if len(tag) > 1 and (tag[1] in ("NNP", "NNG", "NP")): #) or tag[1] == ""):
                #print "[[%s : %s]]" % (tag[0], tag[1])
                    if tag[0] in entity:
                        entity[tag[0]] += 1
                    else:
                        entity[tag[0]] = 1
            except Exception:
                print(tag)
                # break

    max = 0
    ans = []
    for word, count in entity.items():
        if word is not None and count is not None and word not in question_part:
            ans.append((word, count))

    from operator import itemgetter
    ans = sorted(ans, reverse=True, key=itemgetter(1))

    ans = ans[:10]

    base_part = question_part[-1]
    print("base : [%s]" % base_part, end=" ")
    result = rank.pmi_tuple(base_part, ans)
#
#    for word, count in entity.items():
#        if word in origin:
#            continue
#        if count > max:
#            max = count
#            ans = []
#            ans.append(word)
#        elif count == max:
#            ans.append(word)
    return result
Example #8
def main():
    try:
        a = input()
        contant = fo.contant_init()
        s_ready = sp.crawler_init(contant)
        if s_ready is False:
            sys.exit()
        dataframe = sp.crawling()
        fo.write2file(dataframe)
        sys.exit()
    except Exception as e:
        print(e)
        temp = input()
        sys.exit()
def crawling_pelicana():

    result = []

    for page in count(start=1):

        url = 'http://www.pelicana.co.kr/store/stroe_search.html?page=%d' % page
        html = crawler.crawling(url)

        bs = BeautifulSoup(html, 'html.parser')
        tag_table = bs.find('table', attrs={'class': 'table mt20'})

        tag_tbody = tag_table.find('tbody')
        tags_tr = tag_tbody.findAll('tr')

        if len(tags_tr) == 0:
            break

        for tag_tr in tags_tr:
            strings = list(tag_tr.strings)

            name = strings[1]
            address = strings[3]
            sidogu = address.split()[:2]

            result.append((name, address) + tuple(sidogu))

    table = pd.DataFrame(result, columns=['name', 'address', 'sido', 'gungu'])

    # remove duplicates
    table = table.\
        drop_duplicates(subset='name', keep='first').\
        reset_index(drop=True)

    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))

    table.to_csv('{0}/pelicana_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8',
                 mode='w',
                 index=True)
def crawl_kyochon():

    result = []

    for sido1 in range(1, 18):
        for sido2 in count(start=1):
            url = 'http://www.kyochon.com/shop/domestic.asp?sido1=%d&sido2=%d&txtsearch=' % (
                sido1, sido2)
            html = crawler.crawling(url=url)

            if html is None:
                break

            bs = BeautifulSoup(html, 'html.parser')
            tag_ul = bs.find('ul', attrs={'class': 'list'})

            for tag_a in tag_ul.findAll('a'):
                tag_dt = tag_a.find('dt')
                if tag_dt is None:
                    break

                name = tag_dt.get_text()

                tag_dd = tag_a.find('dd')
                if tag_dd is None:
                    break

                address = tag_dd.get_text().strip().split('\r')[0]
                sidogu = address.split()[:2]
                result.append((name, address) + tuple(sidogu))

    table = pd.DataFrame(result, columns=['name', 'address', 'sido', 'gungu'])

    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))

    table.to_csv('{0}/kyochon_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8',
                 mode='w',
                 index=True)
Example #11
def crawling_nene():
    results = []
    first_shopname_prevpage = ''
    for page in count(start=1):
        html = crawling(
            'https://nenechicken.com/17_new/sub_shop01.asp?page={page}&ex_select=1&ex_select2=&IndexSword=&GUBUN=A'
            .format_map({'page': page}))
        # print(html)
        bs = BeautifulSoup(html, 'html.parser')

        tag_div = bs.find('div', attrs={'class': 'shopWrap'})
        tags_div_shop = tag_div.findAll('div', attrs={'class': 'shopInfo'})
        # detect the last page: stop when the first shop name repeats
        shopname = tags_div_shop[0].find('div', attrs={'class': 'shopName'}).text
        if first_shopname_prevpage == shopname:
            break

        first_shopname_prevpage = shopname
        for tag_div_shop in tags_div_shop:

            name = tag_div_shop.find('div', attrs={'class': 'shopName'}).text
            address = tag_div_shop.find('div', attrs={'class': 'shopAdd'}).text
            sidogu = address.split()[:2]
            results.append((name, ) + tuple(sidogu))

            # name = strings[1]
            # address = strings[3]
            # # print(name,address)
            # sidogu = strings[3].split()[:2]
    print(results)

    # store
    table = DataFrame(results, columns=['name', 'sido', 'gugun'])
    table.to_csv('results/table_nene.csv',
                 encoding='utf-8',
                 mode='w',
                 index=True)
Example #12
def lambda_handler(event, context):
    try:
        cursor = conn.cursor()

        # SelectAll
        cursor.execute('SELECT * FROM CHANNEL')
        channelIdList = cursor.fetchall()

        for channelId in channelIdList:
            channel = crawling(channelId[0])

            logger.info(channel)

            id = channelId[0]
            title = channel['title']
            content = channel['content']
            image = channel['image']
            joinDate = channel['joinDate']
            subscriber = channel['subscriber']
            views = channel['views']
            updatedTime = channel['updatedTime']

            # update the channel row with the freshly crawled values
            sql = 'UPDATE channel SET title=%s, content=%s, subscriber=%s, image=%s, views=%s, join_date=%s, updated_time=%s WHERE id=%s'

            # log the rendered query
            logger.info(
                cursor.mogrify(sql, (title, content, subscriber, image, views,
                                     joinDate, updatedTime, id)))

            # execute the update
            cursor.execute(sql, (title, content, subscriber, image, views,
                                 joinDate, updatedTime, id))

            # commit the transaction
            conn.commit()

    except Exception as e:
        logger.error(e)
Example #13
def crawling_nene():
    result = []
    prevShopName = ''
    nextShopName = ''
    for page in count(start=1):
        url = 'https://nenechicken.com/17_new/sub_shop01.asp?page=%d&ex_select=1&ex_select2=&IndexSword=&GUBUN=A' % page

        html = crawling(url)
        bs = BeautifulSoup(html, 'html.parser')
        div = bs.find('div', attrs={'class': 'shopWrap'})
        shops = div.findAll('div', attrs={'class': 'shop'})
        nextShopName = shops[0].find('div', attrs={'class': 'shopName'}).text

        if prevShopName == nextShopName:
            print(prevShopName, nextShopName)
            print("=======================break")

            break

        else:
            print(prevShopName, nextShopName)
            prevShopName = nextShopName

        for shop in shops:
            name = shop.find('div', attrs={'class': 'shopName'}).text
            address = shop.find('div', attrs={'class': 'shopAdd'}).text
            sidogu = address.split()[:2]

            result.append((name, ) + tuple(sidogu))

    # store
    table = DataFrame(result, columns=['name', 'sido', 'gugun'])
    table.to_csv('results/table_nene.csv',
                 encoding='utf-8',
                 mode='w',
                 index=True)
Example #14
def main():

    # run the crawler
    article_data = crawler.crawling()

    # store the raw crawled articles in Elasticsearch
    store_index = input("Enter the Elasticsearch index name to store into: ")
    es.store(store_index, article_data)

    # search an Elasticsearch index
    search_index = input("Enter the Elasticsearch index name to search: ")
    index = es.search(search_index)  # search result from Elasticsearch
    data_list = es.convert_to_list(index)  # keep only the _source values and convert them to a list

    # preprocessing: 1. morphological analysis  2. noun extraction  2-1. stopword removal
    # 1. morphological analysis
    # data_preprocessing.m_analysis(data_list)
    # 2. noun extraction
    nouns_list = data_preprocessing.noun_extraction(data_list)
    # 2-1. stopword removal (applied to the extracted nouns)
    result = data_preprocessing.stopword(nouns_list)

    # store the stopword-filtered result in MySQL
    mysql.nouns_store(result)

    # compute TF
    words = mysql.search_in_dataResult()  # fetch only the noun column needed for the TF calculation
    df_tf = tfidf.cal_tf(words)  # compute the TF values
    mysql.store_tf_value(df_tf)  # store the TF dataframe (id, noun, count)

    # TF-IDF vector - sklearn
    # corpus = tfidf.make_list_for_tfidf(words)
    # tfidf.cal_vector(corpus)

    # n-gram: run the related-keyword (top words) function
    realted_keyword()
Example #15
import crawler


def proc_bbq(html):
    pass


def store_bbq(data):
    pass


if __name__ == '__main__':

    # collection
    crawler.crawling(
        url=
        'https://www.bbq.co.kr/shop/shop_ajax.asp?page=1&pagesize=2000&gu=&si=',
        proc=proc_bbq,
        store=store_bbq)
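This example passes proc and store callbacks to crawler.crawling(), a different signature from the plain crawling(url) used elsewhere. A hedged sketch of what that callback-driven variant might look like; the real crawler module is not shown:

import requests


def crawling(url, proc, store):
    # Hypothetical callback-style crawler: fetch the page, let proc()
    # parse the HTML into records, then hand the records to store().
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    data = proc(response.text)
    store(data)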
Example #16
    args = parser.parse_args()

    existed_video = []

    # path_download = os.path.join(os.path.dirname(os.path.abspath(__file__)), args.download)
    # path_output = os.path.join(os.path.dirname(os.path.abspath(__file__)), args.out_folder)

    if not os.path.exists(args.download):
        os.makedirs(args.download)

    if not os.path.exists(args.out_folder):
        os.makedirs(args.out_folder)

    existed_video = os.listdir(args.out_folder)

    video_ids = crawling(args.keyword, args.num_video)
    video_ids = [x for x in video_ids if x not in existed_video]

    for video in video_ids:
        print(video)
        try:
            download(video, args.download)  # download the video
        except Exception:
            continue

        run(args.download, args.accuracy, args.image_shape, args.out_folder,
            video, args.class_name)

        if os.path.exists(os.path.join(args.download, video + ".mp4")):
            os.remove(os.path.join(args.download, video + ".mp4"))  # delete the downloaded video
def test_crawling():
    crawler.crawling("Videogames", test_seeds_videogames, 600)
    # table = table.reset_index().drop_duplicates(subset='name', keep='first').set_index('index')
    table = table.drop_duplicates(
        subset='name',
        keep='first').reset_index(drop=True).reset_index().set_index('index')
    table.to_csv('{0}/goobne_table.csv'.format(RESULT_DIRECTORY),
                 encoding="utf-8",
                 mode='w',
                 index=True)


if __name__ == '__main__':

    # bbq collection
    crawler.crawling(
        url=
        'https://www.bbq.co.kr/shop/shop_ajax.asp?page=1&pagesize=2000&gu=&si=',
        proc=proc_bbq,
        store=store_bbq)

    # pelicana collection
    crawling_pelicana()

    # nene collection
    crawler.crawling(
        url=
        'http://nenechicken.com/subpage/where_list.asp?target_step2=%s&proc_type=step1&target_step1=%s'
        % (urllib.parse.quote('전체'), urllib.parse.quote('전체')),
        proc=proc_nene,
        store=store_nene)
    # kyochon collection
    crawling_kyochon()