def getProblems(isDaily=False):
    # Return the crawled problems as a list of JSON-style dicts
    if isDaily:
        problems = crawling()
    else:
        problems = crawlProblem() + crawling()

    pros_ = []  # json list
    for problem in problems:
        classify = [problem[7]]
        is_samsung = problem[8]
        flag = 0
        for gpro_ in pros_:
            if int(problem[0]) == gpro_['number']:
                print(problem[0], end=" ")
                print(classify)
                # Merge the categories and the Samsung flag into the existing entry
                gpro_['classify'] += classify
                gpro_['is_samsung'] |= is_samsung
                flag = 1
        if flag == 0:
            pro_ = {
                'number': int(problem[0]),                # problem number
                'subject': problem[1],                    # problem title
                'info': problem[2],                       # problem info tags
                'cor': int(problem[3]),                   # number of accepted submissions
                'total': int(problem[4]),                 # total submissions
                'ratio': float(problem[5][:-1]) / 100.0,  # acceptance ratio
                'link': problem[6],                       # problem link
                'classify': classify,                     # classification: DFS, BFS, ...
                'is_samsung': problem[8]                  # whether it is a Samsung past exam problem
            }
            pros_.append(pro_)
    return pros_
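# Example use of getProblems(): serialize the collected problem list to a JSON file.
# The output filename below is an assumption, not part of the original code.
import json

problems = getProblems(isDaily=True)
with open('problems.json', 'w', encoding='utf-8') as f:
    json.dump(problems, f, ensure_ascii=False, indent=2)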
def crawling_pelicana():
    results = []
    for page in count(start=1):
        html = crawling(
            'https://pelicana.co.kr/store/stroe_search.html?branch_name=&gu=&si=&page=%d' % page)

        bs = BeautifulSoup(html, 'html.parser')
        tag_table = bs.find('table', attrs={'class': 'table mt20'})
        tag_tbody = tag_table.find('tbody')
        tags_tr = tag_tbody.findAll('tr')

        # Detect the last page
        if len(tags_tr) == 0:
            break

        for tag_tr in tags_tr:
            strings = list(tag_tr.strings)
            name = strings[1]
            sidogu = strings[3].split()[:2]
            results.append((name, ) + tuple(sidogu))

    # store
    table = DataFrame(results, columns=['name', 'sido', 'gugun'])
    table.to_csv('results/pelicana.csv', encoding='utf-8', mode='w', index=True)
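# The snippets in this section assume a shared crawling(url) helper that fetches a page
# and returns its HTML text, or None when the request fails. The helper below is only a
# minimal sketch of that assumed contract; the error handling and decoding choices are
# illustrative, not the original implementation.
import urllib.request
from urllib.error import HTTPError, URLError


def crawling(url):
    # Fetch the page and decode it; return None so callers can detect the last page.
    try:
        with urllib.request.urlopen(url) as response:
            return response.read().decode('utf-8', errors='replace')
    except (HTTPError, URLError):
        return None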
def crawling_kyochon():
    results = []
    for sido1 in range(1, 18):
        for sido2 in count(start=1):
            url = 'http://www.kyochon.com/shop/domestic.asp?sido1=%d&sido2=%d&txtsearch=' % (
                sido1, sido2)
            html = crawling(url)
            if html is None:
                break

            bs = BeautifulSoup(html, 'html.parser')
            tag_ul = bs.find('ul', attrs={'class': 'list'})
            tags_a = tag_ul.findAll('a')
            for tag_a in tags_a:
                tag_strong = tag_a.find('strong')
                if tag_strong is None:
                    break
                name = tag_strong.text
                strings = list(tag_a.find('em').strings)
                address = strings[0].strip('\r\n\t')
                sidogu = address.split()[:2]
                results.append((name, ) + tuple(sidogu))

    # store
    table = DataFrame(results, columns=['name', 'sido', 'gugun'])
    table.to_csv('results/kyochon.csv', encoding='utf-8', mode='w', index=True)
def crawling_kyochon():
    result = []
    for sido1 in range(1, 18):
        for sido2 in count(start=1):
            url = 'http://www.kyochon.com/shop/domestic.asp?txtsearch=&sido1=%d&sido2=%d' % (
                sido1, sido2)
            html = crawler.crawling(url=url)
            if html is None:
                break

            bs = BeautifulSoup(html, 'html.parser')
            tag_ul = bs.find('ul', attrs={'class': 'list'})
            for tag_a in tag_ul.findAll('a', href=True):
                name = tag_a.find('dt').get_text()
                address = tag_a.find('dd').get_text().strip().split('\r')[0]
                sidogu = address.split()[:2]
                result.append((name, address) + tuple(sidogu))

    # store
    table = pd.DataFrame(result, columns=['name', 'address', 'sido', 'gungu'])
    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))

    # table = table.reset_index().drop_duplicates(subset='name', keep='first').set_index('index')
    table = table.drop_duplicates(subset='name', keep='first') \
                 .reset_index(drop=True).reset_index().set_index('index')
    table.to_csv('{0}/kyochon_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8', mode='w', index=True)
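# sido_dict and gungu_dict are assumed lookup tables that normalize abbreviated region
# names in the scraped addresses to their full administrative names. Their real contents
# are not shown in this section; the two-entry versions below are only illustrative.
sido_dict = {'서울': '서울특별시', '경기': '경기도'}
gungu_dict = {'강남': '강남구', '수원': '수원시'}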
def crawling_nene():
    results = []
    first_shopname_prevpage = ''
    for page in count(start=1):
        html = crawling(
            'https://nenechicken.com/17_new/sub_shop01.asp?ex_select=1&ex_select2=&IndexSword=&GUBUN=A&page=%d' % page)

        bs = BeautifulSoup(html, 'html.parser')
        tags_div = bs.findAll('div', attrs={'class': 'shopInfo'})

        # Detect the last page: it repeats the first shop of the previous page
        shopname = tags_div[0].find('div', attrs={'class': 'shopName'}).text
        if first_shopname_prevpage == shopname:
            break
        first_shopname_prevpage = shopname

        for tag_div in tags_div:
            name = tag_div.find('div', attrs={'class': 'shopName'}).text
            address = tag_div.find('div', attrs={'class': 'shopAdd'}).text
            sidogu = address.split()[:2]
            results.append((name, ) + tuple(sidogu))

    # store
    table = DataFrame(results, columns=['name', 'sido', 'gugun'])
    table.to_csv('results/nene.csv', encoding='utf-8', mode='w', index=True)
def only_crawling():
    # Run the crawler
    article_data = crawler.crawling()

    # Store the raw crawled data in Elasticsearch
    # store_index = input("Enter the Elasticsearch index name to store into: ")
    es.store("olympic", article_data)
def get_answer(question):
    # Extract the noun phrases from the question itself
    question_post = tagger.postagging(question)
    question_part = []
    for q in question_post:
        if q[1] in ("NNP", "NNG", "NNB", "NP"):
            question_part.append(q[0])

    lines = crawler.crawling(question, question_part)

    # Count candidate answer entities in the crawled lines
    entity = {}
    for line in lines:
        # Normalize brackets, quotes, and fullwidth punctuation before tagging
        line = (line.replace("[", " ").replace("]", " ")
                    .replace("“", "\"").replace("”", "\"")
                    .replace("*", " ")
                    .replace("’", "'").replace("‘", "'")
                    .replace("（", "(").replace("：", ":")
                    .replace("?", " ").replace("•", " "))
        tags = tagger.postagging_grouped(line)
        for tag in tags:
            try:
                if len(tag) > 1 and tag[1] in ("NNP", "NNG", "NP"):
                    if tag[0] in entity:
                        entity[tag[0]] += 1
                    else:
                        entity[tag[0]] = 1
            except Exception:
                print(tag)

    # Keep the ten most frequent entities that are not words of the question
    ans = []
    for word, count in entity.items():
        if word is not None and count is not None and word not in question_part:
            ans.append((word, count))

    from operator import itemgetter
    ans = sorted(ans, reverse=True, key=itemgetter(1))
    ans = ans[:10]

    # Rank the candidates by PMI against the last question noun
    base_part = question_part[len(question_part) - 1]
    print("base : [%s]" % base_part, end=" ")
    result = rank.pmi_tuple(base_part, ans)
    return result
def main():
    try:
        a = input()
        contant = fo.contant_init()
        s_ready = sp.crawler_init(contant)
        if s_ready is False:
            sys.exit()
        dataframe = sp.crawling()
        fo.write2file(dataframe)
        sys.exit()
    except Exception as e:
        print(e)
        temp = input()
        sys.exit()
def crawling_pelicana():
    result = []
    for page in count(start=1):
        url = 'http://www.pelicana.co.kr/store/stroe_search.html?page=%d' % page
        html = crawler.crawling(url)

        bs = BeautifulSoup(html, 'html.parser')
        tag_table = bs.find('table', attrs={'class': 'table mt20'})
        tag_tbody = tag_table.find('tbody')
        tags_tr = tag_tbody.findAll('tr')

        if len(tags_tr) == 0:
            break

        for tag_tr in tags_tr:
            strings = list(tag_tr.strings)
            name = strings[1]
            address = strings[3]
            sidogu = address.split()[:2]
            result.append((name, address) + tuple(sidogu))

    table = pd.DataFrame(result, columns=['name', 'address', 'sido', 'gungu'])

    # Drop duplicates
    table = table.\
        drop_duplicates(subset='name', keep='first').\
        reset_index(drop=True)

    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))

    table.to_csv('{0}/pelicana_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8', mode='w', index=True)
def crawl_kyochon():
    result = []
    for sido1 in range(1, 18):
        for sido2 in count(start=1):
            url = 'http://www.kyochon.com/shop/domestic.asp?sido1=%d&sido2=%d&txtsearch=' % (
                sido1, sido2)
            html = crawler.crawling(url=url)
            if html is None:
                break

            bs = BeautifulSoup(html, 'html.parser')
            tag_ul = bs.find('ul', attrs={'class': 'list'})
            for tag_a in tag_ul.findAll('a'):
                tag_dt = tag_a.find('dt')
                if tag_dt is None:
                    break
                name = tag_dt.get_text()

                tag_dd = tag_a.find('dd')
                if tag_dd is None:
                    break
                address = tag_dd.get_text().strip().split('\r')[0]

                sidogu = address.split()[:2]
                result.append((name, address) + tuple(sidogu))

    table = pd.DataFrame(result, columns=['name', 'address', 'sido', 'gungu'])
    table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: gungu_dict.get(v, v))
    table.to_csv('{0}/kyochon_table.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8', mode='w', index=True)
def crawling_nene():
    results = []
    first_shopname_prevpage = ''
    for page in count(start=1):
        html = crawling(
            'https://nenechicken.com/17_new/sub_shop01.asp?page={page}&ex_select=1&ex_select2=&IndexSword=&GUBUN=A'
            .format_map({'page': page}))

        bs = BeautifulSoup(html, 'html.parser')
        tag_div = bs.find('div', attrs={'class': 'shopWrap'})
        tags_div_shop = tag_div.findAll('div', attrs={'class': 'shopInfo'})

        # Detect the last page: it repeats the first shop of the previous page
        shopname = tags_div_shop[0].find('div', attrs={'class': 'shopName'}).text
        if first_shopname_prevpage == shopname:
            break
        first_shopname_prevpage = shopname

        for tag_div_shop in tags_div_shop:
            name = tag_div_shop.find('div', attrs={'class': 'shopName'}).text
            address = tag_div_shop.find('div', attrs={'class': 'shopAdd'}).text
            sidogu = address.split()[:2]
            results.append((name, ) + tuple(sidogu))

    print(results)

    # store
    table = DataFrame(results, columns=['name', 'sido', 'gugun'])
    table.to_csv('results/table_nene.csv', encoding='utf-8', mode='w', index=True)
def lambda_handler(event, context):
    try:
        cursor = conn.cursor()

        # Select all channels
        cursor.execute('SELECT * FROM CHANNEL')
        channelIdList = cursor.fetchall()

        for channelId in channelIdList:
            channel = crawling(channelId[0])
            logger.info(channel)

            id = channelId[0]
            title = channel['title']
            content = channel['content']
            image = channel['image']
            joinDate = channel['joinDate']
            subscriber = channel['subscriber']
            views = channel['views']
            updatedTime = channel['updatedTime']

            sql = ('UPDATE channel SET title=%s, content=%s, subscriber=%s, image=%s, '
                   'views=%s, join_date=%s, updated_time=%s WHERE id=%s')

            # Log the query
            logger.info(
                cursor.mogrify(sql, (title, content, subscriber, image, views,
                                     joinDate, updatedTime, id)))
            # Execute the query
            cursor.execute(sql, (title, content, subscriber, image, views,
                                 joinDate, updatedTime, id))
            # Commit
            conn.commit()
    except Exception as e:
        logger.error(e)
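# conn and logger above are module-level objects that are not shown in this snippet.
# A minimal sketch of how they might be initialized outside the handler; reading the
# connection parameters from environment variables is an assumption, not the original setup.
import logging
import os

import psycopg2

logger = logging.getLogger()
logger.setLevel(logging.INFO)

conn = psycopg2.connect(
    host=os.environ['DB_HOST'],
    dbname=os.environ['DB_NAME'],
    user=os.environ['DB_USER'],
    password=os.environ['DB_PASSWORD'])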
def crawling_nene():
    result = []
    prevShopName = ''
    nextShopName = ''
    for page in count(start=1):
        url = 'https://nenechicken.com/17_new/sub_shop01.asp?page=%d&ex_select=1&ex_select2=&IndexSword=&GUBUN=A' % page
        html = crawling(url)

        bs = BeautifulSoup(html, 'html.parser')
        div = bs.find('div', attrs={'class': 'shopWrap'})
        shops = div.findAll('div', attrs={'class': 'shop'})

        nextShopName = shops[0].find('div', attrs={'class': 'shopName'}).text
        if prevShopName == nextShopName:
            print(prevShopName, nextShopName)
            print("=======================break")
            break
        else:
            print(prevShopName, nextShopName)
            prevShopName = nextShopName

        for shop in shops:
            name = shop.find('div', attrs={'class': 'shopName'}).text
            address = shop.find('div', attrs={'class': 'shopAdd'}).text
            sidogu = address.split()[:2]
            result.append((name, ) + tuple(sidogu))

    # store
    table = DataFrame(result, columns=['name', 'sido', 'gugun'])
    table.to_csv('results/table_nene.csv', encoding='utf-8', mode='w', index=True)
def main():
    # Run the crawler
    article_data = crawler.crawling()

    # Store the raw crawled data in Elasticsearch
    store_index = input("Enter the Elasticsearch index name to store into: ")
    es.store(store_index, article_data)

    # Search an Elasticsearch index
    search_index = input("Enter the Elasticsearch index name to search: ")
    index = es.search(search_index)  # search result from Elasticsearch

    # Take only the _source (data values) of the hits and convert them to a list
    data_list = es.convert_to_list(index)

    # Preprocessing: 1. morphological analysis  2. noun extraction  2-1. stopword removal
    # 1. Morphological analysis
    # data_preprocessing.m_analysis(data_list)

    # 2. Noun extraction
    nouns_list = data_preprocessing.noun_extraction(data_list)

    # 2-1. Stopword removal (on the extracted nouns)
    result = data_preprocessing.stopword(nouns_list)

    # Store the stopword-filtered result in MySQL
    mysql.nouns_store(result)

    # Compute TF
    words = mysql.search_in_dataResult()   # fetch only the noun column needed for TF
    df_tf = tfidf.cal_tf(words)            # compute TF values
    mysql.store_tf_value(df_tf)            # store the TF DataFrame (id, noun, count)

    # TF-IDF vector - sklearn
    # corpus = tfidf.make_list_for_tfidf(words)
    # tfidf.cal_vector(corpus)

    # n-gram: run the related-keyword (top word) function
    realted_keyword()
import crawler


def proc_bbq(html):
    pass


def store_bbq(data):
    pass


if __name__ == '__main__':
    # collection
    crawler.crawling(
        url='https://www.bbq.co.kr/shop/shop_ajax.asp?page=1&pagesize=2000&gu=&si=',
        proc=proc_bbq,
        store=store_bbq)
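# proc_bbq and store_bbq above are left as stubs. The sketch below only illustrates the
# proc/store contract that crawler.crawling() appears to expect: proc parses the raw
# response into records, store persists them. The selectors and the output path are
# hypothetical, since the real shop_ajax.asp markup is not shown here.
import pandas as pd
from bs4 import BeautifulSoup


def proc_bbq_sketch(html):
    # Hypothetical parsing: pull (name, sido, gungu) out of whatever table rows exist.
    bs = BeautifulSoup(html, 'html.parser')
    records = []
    for row in bs.findAll('tr'):
        cells = [td.get_text(strip=True) for td in row.findAll('td')]
        if len(cells) >= 2:
            name, address = cells[0], cells[1]
            parts = address.split()[:2]
            if len(parts) == 2:
                records.append((name,) + tuple(parts))
    return records


def store_bbq_sketch(data):
    # Persist the parsed records the same way the other collectors in this section do.
    table = pd.DataFrame(data, columns=['name', 'sido', 'gungu'])
    table.to_csv('results/bbq.csv', encoding='utf-8', mode='w', index=True)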
args = parser.parse_args()

existed_video = []

# path_download = os.path.join(os.path.dirname(os.path.abspath(__file__)), args.download)
# path_output = os.path.join(os.path.dirname(os.path.abspath(__file__)), args.out_folder)

if not os.path.exists(args.download):
    os.makedirs(args.download)
if not os.path.exists(args.out_folder):
    os.makedirs(args.out_folder)

existed_video = os.listdir(args.out_folder)

video_ids = crawling(args.keyword, args.num_video)
video_ids = [x for x in video_ids if x not in existed_video]

for video in video_ids:
    print(video)
    try:
        download(video, args.download)  # download the video
    except Exception:
        continue
    run(args.download, args.accuracy, args.image_shape, args.out_folder, video,
        args.class_name)
    if os.path.exists(os.path.join(args.download, video + ".mp4")):
        os.remove(os.path.join(args.download, video + ".mp4"))  # delete the video
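# The parser used above is not shown in this snippet. This is a minimal sketch of the
# argument definitions implied by the attributes it reads (keyword, num_video, download,
# out_folder, accuracy, image_shape, class_name); the defaults and help strings are
# assumptions, not the original values.
import argparse

parser = argparse.ArgumentParser(description='Crawl videos by keyword and run detection on them.')
parser.add_argument('--keyword', type=str, required=True, help='search keyword for video crawling')
parser.add_argument('--num_video', type=int, default=10, help='number of videos to crawl')
parser.add_argument('--download', type=str, default='downloads', help='directory for downloaded videos')
parser.add_argument('--out_folder', type=str, default='output', help='directory for processed results')
parser.add_argument('--accuracy', type=float, default=0.5, help='detection confidence threshold')
parser.add_argument('--image_shape', type=int, default=416, help='input image size for the model')
parser.add_argument('--class_name', type=str, default='person', help='target class to extract')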
def test_crawling():
    crawler.crawling("Videogames", test_seeds_videogames, 600)
# table = table.reset_index().drop_duplicates(subset='name', keep='first').set_index('index')
table = table.drop_duplicates(subset='name', keep='first') \
             .reset_index(drop=True).reset_index().set_index('index')
table.to_csv('{0}/goobne_table.csv'.format(RESULT_DIRECTORY),
             encoding='utf-8', mode='w', index=True)


if __name__ == '__main__':
    # bbq collection
    crawler.crawling(
        url='https://www.bbq.co.kr/shop/shop_ajax.asp?page=1&pagesize=2000&gu=&si=',
        proc=proc_bbq,
        store=store_bbq)

    # pelicana collection
    crawling_pelicana()

    # nene collection
    crawler.crawling(
        url='http://nenechicken.com/subpage/where_list.asp?target_step2=%s&proc_type=step1&target_step1=%s'
            % (urllib.parse.quote('전체'), urllib.parse.quote('전체')),
        proc=proc_nene,
        store=store_nene)

    # kyochon collection
    crawling_kyochon()