import re

import kss
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

# Project-local helpers (Sql, SeleniumDriver, CUSTOM_HEADER, cleanse, cleanse_text,
# cleanse_sentence) are assumed to be imported elsewhere in the project; their
# import lines are not shown in this snippet.


def crawl_auction(songID):
    try:
        print(songID)
        songID = str(songID)
        db = Sql("khroma")
        url = "https://www.musicow.com/auction/%s" % (songID)
        CUSTOM_HEADER['referer'] = url
        r = requests.get(url, headers=CUSTOM_HEADER)
        bs = BeautifulSoup(r.text, 'html.parser')
        # The profit figure sits inside an inline <script>: strip letters, '=' and
        # whitespace, then everything up to the last quote.
        text_info = bs.select_one('#tab1').script.text.split(";")
        profit_raw = re.sub(r"[A-z=\s]", "", text_info[2])
        profit_info = re.sub(r".+'", "", profit_raw)
        print(profit_info)
        auction = bs.select('dl.price strong')
        auctionAmount = int(re.sub(r"\D", "", auction[1].text))
        auctionStartPrice = int(re.sub(r"\D", "", auction[2].text))
        # auctionLowPrice = int(re.sub(r"\D", "", auction[2].text))
        # auctionAvgPrice = int(re.sub(r"\D", "", auction[3].text))
        print(auctionStartPrice)
        info_list = bs.select('div.lst_bul p')
        share_raw = re.sub(r"\s", "", info_list[0].text)
        shares = int(share_raw.replace("1/", "").replace(",", ""))
        print(shares)
        db.insert_withoutDuplication('musicow_auction', check_list=['songID'],
                                     songID=songID, profit_info=profit_info, shares=shares,
                                     auctionAmount=auctionAmount,
                                     auctionStartPrice=auctionStartPrice)
    except Exception as ex:
        print(ex)
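
# crawl_auction() and the crawlers below rely on CUSTOM_HEADER, a dict of request
# headers whose 'referer' key is overwritten per request, plus the Sql DB wrapper.
# The helper below is an illustrative stand-in only; the function name and the
# User-Agent string are assumptions, not part of the original project.
def make_custom_header(referer="https://www.musicow.com/"):
    """Return a header dict shaped like the CUSTOM_HEADER this module expects."""
    return {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "referer": referer,
    }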

def crawl_deal():
    db = Sql("khroma")
    page = 1
    num = 0
    while True:
        print("Page:", page)
        url_deal = "https://www.musicow.com/auctions?tab=market&keyword=&sortorder=&page=%d" % (page)
        CUSTOM_HEADER['referer'] = url_deal
        r1 = requests.get(url_deal, headers=CUSTOM_HEADER)
        bs1 = BeautifulSoup(r1.text, 'html.parser')
        song_list = bs1.select('#list li')
        for song in song_list:
            num += 1
            url2_add = song.a["href"]
            dealID = url2_add.split("/")[2].strip()
            print(dealID)
            txt = song.select('div.txt dl')
            title = txt[0].dd.text
            singer = txt[1].dd.text
            currentPrice = int(re.sub(r"\D", "", txt[2].dd.text))
            db.insert('musicow_deal', dealID=dealID, title=title, singer=singer,
                      currentPrice=currentPrice, num_order=num)
        page += 1
        # An empty listing page means we have walked past the last page.
        if len(song_list) == 0:
            break

def crawl_list():
    db = Sql("khroma")
    page = 1
    while True:
        print("Page:", page)
        url_list = "https://www.musicow.com/auctions?tab=closed&keyword=&page=" + str(page)
        CUSTOM_HEADER['referer'] = url_list
        r1 = requests.get(url_list, headers=CUSTOM_HEADER)
        bs1 = BeautifulSoup(r1.text, 'html.parser')
        song_list = bs1.select('ul.user_buy li')
        for song in song_list:
            url2_add = song.a["href"]
            songID = url2_add.split("/")[2].strip()
            print(songID)
            txt = song.select('div.txt dl')
            title = txt[0].dd.text
            singer = txt[1].dd.text
            auctionDate = txt[2].dd.text
            db.insert_withoutDuplication('musicow_list', check_list=['songID'], songID=songID,
                                         title=title, singer=singer, auctionDate=auctionDate)
            # Also crawl the detail page for each closed auction.
            crawl_auction(songID)
        page += 1
        if len(song_list) == 0:
            break

def update_info():
    db = Sql("khroma")
    dealID_data = db.select("musicow_deal", "dealID")
    dealID_list = set([d["dealID"] for d in dealID_data])
    for dealID in dealID_list:
        try:
            print(dealID)
            url = "https://www.musicow.com/song/%s?tab=info" % (dealID)
            CUSTOM_HEADER['referer'] = url
            r = requests.get(url, headers=CUSTOM_HEADER)
            bs = BeautifulSoup(r.text, 'html.parser')
            title = bs.select_one('strong.song_title').text.strip()
            singer = bs.select_one('span.artist').text.strip()
            auction = bs.select('div.row-col-2 dd')
            auctionAmount = int(re.sub(r"\D", "", auction[0].text))
            auctionStartPrice = int(re.sub(r"\D", "", auction[1].text))
            auctionLowPrice = int(re.sub(r"\D", "", auction[2].text))
            auctionAvgPrice = int(re.sub(r"\D", "", auction[3].text))
            db.insert_withoutDuplication('musicow_info', check_list=['dealID'], dealID=dealID,
                                         title=title, singer=singer,
                                         auctionAmount1=auctionAmount,
                                         auctionStartPrice1=auctionStartPrice,
                                         auctionLowPrice1=auctionLowPrice,
                                         auctionAvgPrice1=auctionAvgPrice)
        except Exception as ex:
            print(ex)
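
# Convenience wrapper (illustrative, not part of the original module): runs the
# musicow crawls in dependency order. crawl_list() already crawls each closed
# auction's detail page via crawl_auction(); crawl_deal() then snapshots the
# market listing, and update_info() refreshes per-song details.
def crawl_musicow_all():
    crawl_list()
    crawl_deal()
    update_info()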

class Data:
    def __init__(self, dbName):
        self.db = Sql(dbName)
        self.df = pd.DataFrame()

    def drop_duplicates(self, subset):
        self.df = self.df.drop_duplicates(subset=subset)

    def shape(self):
        return self.df.shape

    def setDB(self, dbName):
        pass

    def get_df(self, *colnames, by_sentence_textColname=None):
        '''
        :param colnames: column names (str)
        :param by_sentence_textColname: name of the text column to split into sentences
        :return: DataFrame
        '''
        df_documents = self.df.loc[:, list(colnames)]
        if by_sentence_textColname:
            df_sentences = pd.DataFrame()
            nrows = df_documents.shape[0]
            for i in tqdm(range(nrows), "loader : Getting Sentences "):
                row = df_documents.iloc[i]
                text = row[by_sentence_textColname]
                if len(text) > 0:
                    text = cleanse_text(text)
                    # Texts longer than 300 characters show up all the time; needs checking.
                    sentences = kss.split_sentences(text)
                    for s in sentences:
                        s = cleanse_sentence(s)
                        if len(s) > 0:
                            row_temp = row.copy()
                            row_temp[by_sentence_textColname] = s
                            # DataFrame.append was removed in pandas 2.0; concat is equivalent here.
                            df_sentences = pd.concat([df_sentences, row_temp.to_frame().T])
                else:
                    continue
            print(f"loader : Getting DataFrame Done {nrows} Documents to {df_sentences.shape[0]} Sentences")
            return df_sentences
        else:
            return df_documents

    def _getSentence_(self, text_seq):
        pass

    def addData(self, channel, keyword, fromDate, toDate, tablename,
                dbfnameChannel, dbfnameKeyword, dbfnamePostDate, drop_duplicate_by=None):
        '''
        :param channel: str
        :param keyword: str
        :param fromDate: 'yyyy-mm-dd'
        :param toDate: 'yyyy-mm-dd'
        :param tablename: str
        :param drop_duplicate_by: list of column names used for de-duplication, e.g. ['keyword', 'url']
        :return:
        '''
        nrows0 = self.df.shape[0]
        ldf = self._load_(channel, keyword, fromDate, toDate, tablename,
                          dbfnameChannel, dbfnameKeyword, dbfnamePostDate)
        print(ldf)
        nrowsldf = ldf.shape[0]
        self.df = pd.concat([self.df, ldf])
        addednRows = nrowsldf
        droppednRows = 0
        if drop_duplicate_by:
            # Assign the result back; drop_duplicates does not modify in place by default.
            self.df = self.df.drop_duplicates(subset=drop_duplicate_by)
            addednRows = self.df.shape[0] - nrows0
            droppednRows = nrowsldf - addednRows
        print(f'addData : added {addednRows} rows (dropped {droppednRows} rows)')

    def _load_(self, channel, keyword, fromDate, toDate, tablename,
               dbfnameChannel, dbfnameKeyword, dbfnamePostDate):
        where_str = (f"{dbfnameKeyword}='{keyword}' and {dbfnameChannel}='{channel}' "
                     f"and {dbfnamePostDate} between '{fromDate}' and '{toDate}'")
        df = self.db.select(tablename, "*", where_str, asDataFrame=True)
        return df
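
# Illustrative usage of Data (not part of the original module). The table and
# column names mirror what crawl() below writes into 'crawled_data'; the keyword
# and date range are placeholders.
def demo_data_loader():
    data = Data("dalmaden")
    data.addData(channel="navershopping",
                 keyword="example keyword",
                 fromDate="2021-01-01",
                 toDate="2021-12-31",
                 tablename="crawled_data",
                 dbfnameChannel="channel",
                 dbfnameKeyword="keyword",
                 dbfnamePostDate="post_date",
                 drop_duplicate_by=["keyword", "url"])
    # Split each document's text column into cleaned sentences.
    return data.get_df("title", "text", by_sentence_textColname="text")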

def crawl(keyword, productURL, productName, comment="navershopping"):
    # DB name
    db = Sql('dalmaden')
    task_id = db.insert('task_log', comment=comment)
    driver = SeleniumDriver()
    driver.get(productURL)
    # Scroll down manually so the review list is rendered, then continue.
    input("Scroll down manually until the review list is shown, then press Enter")
    maxPage = 10
    num = 0
    while True:
        # Review container
        ele = driver.driver.find_element_by_css_selector("#area_review_list .detail_list_review")
        # Review list
        ele = ele.find_elements_by_css_selector('li')
        for e in ele:
            print("################################")
            try:
                num += 1
                print(e.text)
                channel = 'navershopping'
                text_info = e.find_elements_by_css_selector('div.area_status_user span')
                author, date_raw, post_date, option = "", "", "", ""
                try:
                    author = text_info[0].text
                    date_raw = '20' + text_info[1].text
                    date_lst = date_raw.split(".")
                    post_date = "-".join(date_lst[:-1])
                    option = e.find_element_by_css_selector('p.text_option').text
                except:
                    pass
                text = e.find_element_by_css_selector('span.text').text
                text = cleanse(text)
                rating = e.find_element_by_css_selector('span.number_grade').text
                db.insert('crawled_data', task_id=task_id, channel=channel, keyword=keyword,
                          num=num, post_date=post_date, title=productName, text=text,
                          author=author, url=productURL, etc1=rating, etc2=option)
            except Exception as ex:
                print(ex)
        # Pagination: current page number
        pageNum = int(driver.driver.find_element_by_css_selector(
            "nav._review_list_page a[aria-selected='true']").text)
        print(pageNum)
        nextPage = pageNum + 1
        if nextPage > maxPage:
            # Move to the next block of page links
            driver.driver.find_element_by_xpath(
                "//*[contains(@class,'module_pagination')]//a[contains(@class,'next')]").click()
            maxPage = pageNum + 10
        else:
            # Click the next page number
            driver.driver.find_element_by_xpath(
                "//*[contains(@class,'module_pagination')]//*[text()=%d]" % (nextPage)).click()
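
# Example invocation (illustrative): the keyword, product URL, and product name
# below are placeholders, not real targets. The script pauses for a manual scroll
# so the review list is rendered before crawling starts.
if __name__ == "__main__":
    crawl(keyword="example keyword",
          productURL="https://shopping.naver.com/...",  # placeholder URL
          productName="example product")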