Example #1
def crawl_auction(songID):
    try:
        print(songID)
        songID = str(songID)
        db = Sql("khroma")
        url = "https://www.musicow.com/auction/%s" % (songID)
        CUSTOM_HEADER['referer'] = url
        r = requests.get(url, headers=CUSTOM_HEADER)
        bs = BeautifulSoup(r.text, 'html.parser')

        text_info = bs.select_one('#tab1').script.text.split(";")
        # strip letters, '=' and whitespace, then drop everything up to the last quote
        profit_raw = re.sub(r"[A-Za-z=\s]", "", text_info[2])
        profit_info = re.sub(".+'", "", profit_raw)
        print(profit_info)
        auction = bs.select('dl.price strong')
        auctionAmount = int(re.sub(r"\D", "", auction[1].text))
        auctionStartPrice = int(re.sub(r"\D", "", auction[2].text))
        #auctionLowPrice = int(re.sub(r"\D", "", auction[2].text))
        #auctionAvgPrice = int(re.sub(r"\D", "", auction[3].text))
        print(auctionStartPrice)
        info_list = bs.select('div.lst_bul p')
        share_raw = re.sub(r"\s", "", info_list[0].text)
        # the raw text looks like "1/NNN,NNN"; keep only the total share count
        shares = int(share_raw.replace("1/", "").replace(",", ""))
        print(shares)
        db.insert_withoutDuplication('musicow_auction',
                                     check_list=['songID'],
                                     songID=songID,
                                     profit_info=profit_info,
                                     shares=shares,
                                     auctionAmount=auctionAmount,
                                     auctionStartPrice=auctionStartPrice)
    except Exception as ex:
        print(ex)
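
The excerpt above depends on module-level names it does not define (CUSTOM_HEADER, Sql, requests, BeautifulSoup, re). A minimal sketch of that setup and a call, with the header contents and the song ID assumed rather than taken from the original project:

# Assumed setup for the excerpts in this listing; values are placeholders.
import re

import requests
from bs4 import BeautifulSoup

CUSTOM_HEADER = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",  # assumed User-Agent
}

# crawl_auction() and the project's Sql class are assumed to be in scope.
crawl_auction(12345)  # 12345 is a placeholder songID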
Example #2
def crawl_deal():
    db = Sql("khroma")
    page = 1
    num = 0
    while True:
        print("페이지 : ", page)

        url_deal = "https://www.musicow.com/auctions?tab=market&keyword=&sortorder=&page=%d" % (
            page)
        CUSTOM_HEADER['referer'] = url_deal
        r1 = requests.get(url_deal, headers=CUSTOM_HEADER)
        bs1 = BeautifulSoup(r1.text, 'html.parser')

        song_list = bs1.select('#list li')
        for song in song_list:
            num += 1
            url2_add = song.a["href"]
            dealID = url2_add.split("/")[2].strip()
            print(dealID)
            txt = song.select('div.txt dl')
            title = txt[0].dd.text
            singer = txt[1].dd.text
            currentPrice = int(re.sub(r"\D", "", txt[2].dd.text))

            update = db.insert('musicow_deal',
                               dealID=dealID,
                               title=title,
                               singer=singer,
                               currentPrice=currentPrice,
                               num_order=num)
        page += 1
        if len(song_list) == 0:
            break
Example #3
def crawl_list():
    db = Sql("khroma")
    page = 1
    while True:
        print("페이지 : ", page)

        url_list = "https://www.musicow.com/auctions?tab=closed&keyword=&page=" + str(
            page)
        CUSTOM_HEADER['referer'] = url_list
        r1 = requests.get(url_list, headers=CUSTOM_HEADER)
        bs1 = BeautifulSoup(r1.text, 'html.parser')
        song_list = bs1.select('ul.user_buy li')
        for song in song_list:
            url2_add = song.a["href"]
            songID = url2_add.split("/")[2].strip()
            print(songID)
            txt = song.select('div.txt dl')
            title = txt[0].dd.text
            singer = txt[1].dd.text
            auctionDate = txt[2].dd.text

            db.insert_withoutDuplication('musicow_list',
                                         check_list=['songID'],
                                         songID=songID,
                                         title=title,
                                         singer=singer,
                                         auctionDate=auctionDate)
            crawl_auction(songID)
        page += 1
        if len(song_list) == 0:
            break
Example #4
def update_info():
    db = Sql("khroma")
    dealID_data = db.select("musicow_deal", "dealID")
    dealID_list = set([d["dealID"] for d in dealID_data])
    for dealID in dealID_list:
        try:
            print(dealID)
            url = "https://www.musicow.com/song/%s?tab=info" % (dealID)
            CUSTOM_HEADER['referer'] = url
            r = requests.get(url, headers=CUSTOM_HEADER)
            bs = BeautifulSoup(r.text, 'html.parser')
            title = bs.select_one('strong.song_title').text.strip()
            singer = bs.select_one('span.artist').text.strip()
            auction = bs.select('div.row-col-2 dd')
            auctionAmount = int(re.sub(r"\D", "", auction[0].text))
            auctionStartPrice = int(re.sub(r"\D", "", auction[1].text))
            auctionLowPrice = int(re.sub(r"\D", "", auction[2].text))
            auctionAvgPrice = int(re.sub(r"\D", "", auction[3].text))

            db.insert_withoutDuplication('musicow_info',
                                         check_list=['dealID'],
                                         dealID=dealID,
                                         title=title,
                                         singer=singer,
                                         auctionAmount1=auctionAmount,
                                         auctionStartPrice1=auctionStartPrice,
                                         auctionLowPrice1=auctionLowPrice,
                                         auctionAvgPrice1=auctionAvgPrice)
        except Exception as ex:
            print(ex)
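
Every example writes through a project-specific Sql wrapper whose source is not shown in this listing. The sketch below is only a guess at the behaviour insert_withoutDuplication appears to provide (skip the insert when a row already matches the key columns); the real class, its connection settings, and its SQL are unknown.

# A sketch of what insert_withoutDuplication appears to do; connection settings and
# SQL below are assumptions, not the project's actual implementation.
import pymysql

class Sql:
    def __init__(self, dbName):
        self.conn = pymysql.connect(host="localhost", user="user", password="password",
                                    db=dbName, charset="utf8mb4",
                                    cursorclass=pymysql.cursors.DictCursor)

    def insert_withoutDuplication(self, table, check_list, **fields):
        # insert only when no existing row matches all columns named in check_list
        where = " and ".join(f"{col}=%s" for col in check_list)
        with self.conn.cursor() as cur:
            cur.execute(f"select 1 from {table} where {where} limit 1",
                        [fields[col] for col in check_list])
            if cur.fetchone():
                return
            cols = ", ".join(fields)
            marks = ", ".join(["%s"] * len(fields))
            cur.execute(f"insert into {table} ({cols}) values ({marks})",
                        list(fields.values()))
        self.conn.commit()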
Example #5
class Data:
    def __init__(self, dbName):
        self.db = Sql(dbName)
        self.df = pd.DataFrame()

    def drop_duplicates(self, subset):
        self.df = self.df.drop_duplicates(subset=subset)

    def shape(self):
        return self.df.shape

    def setDB(self, dbName):
        pass

    def get_df(self, *colnames, by_sentence_textColname=None):
        '''
        :param colnames: column names (str)
        :param by_sentence_textColname: name of the text column to split into sentences
        :return: DataFrame
        '''
        df_documents = self.df.loc[:, list(colnames)]
        if by_sentence_textColname:
            sentence_rows = []
            nrows = df_documents.shape[0]
            for i in tqdm(range(nrows), "loader : Getting Sentences "):
                row = df_documents.iloc[i]
                text = row[by_sentence_textColname]
                if len(text) > 0:
                    text = cleanse_text(text)
                    # texts longer than 300 characters come up often; needs checking
                    sentences = kss.split_sentences(text)
                    for s in sentences:
                        s = cleanse_sentence(s)
                        if len(s) > 0:
                            row_temp = row.copy()
                            row_temp[by_sentence_textColname] = s
                            sentence_rows.append(row_temp)
                else:
                    continue
            # DataFrame.append was removed in pandas 2.0, so rows are collected in a
            # list and turned into a DataFrame once
            df_sentences = pd.DataFrame(sentence_rows)
            print(
                f"loader : Getting DataFrame Done {nrows} Documents to {df_sentences.shape[0]} Sentences"
            )
            return df_sentences
        else:
            return df_documents

    def _getSentence_(self, text_seq):
        pass

    def addData(self,
                channel,
                keyword,
                fromDate,
                toDate,
                tablename,
                dbfnameChannel,
                dbfnameKeyword,
                dbfnamePostDate,
                drop_duplicate_by=None):
        '''
        :param channel: str
        :param keyword: str
        :param fromDate: 'yyyy-mm-dd'
        :param toDate: 'yyyy-mm-dd'
        :param tablename: str
        :param drop_duplicate_by: list of column names to deduplicate by, e.g. ['keyword', 'url']
        :return:
        '''
        nrows0 = self.df.shape[0]
        ldf = self._load_(channel, keyword, fromDate, toDate, tablename,
                          dbfnameChannel, dbfnameKeyword, dbfnamePostDate)
        print(ldf)
        nrowsldf = ldf.shape[0]

        # DataFrame.append was removed in pandas 2.0; use concat instead
        self.df = pd.concat([self.df, ldf])
        addednRows = nrowsldf
        droppednRows = 0

        if drop_duplicate_by:
            # drop_duplicates returns a new frame, so assign the result back
            self.df = self.df.drop_duplicates(subset=drop_duplicate_by)
            addednRows = self.df.shape[0] - nrows0
            droppednRows = nrowsldf - addednRows
        print(
            f'addData : added {addednRows} rows (dropped {droppednRows} rows)')

    def _load_(self, channel, keyword, fromDate, toDate, tablename,
               dbfnameChannel, dbfnameKeyword, dbfnamePostDate):
        where_str = f"{dbfnameKeyword}='{keyword}' and {dbfnameChannel}='{channel}' and {dbfnamePostDate} between '{fromDate}' and '{toDate}'"
        df = self.db.select(tablename, "*", where_str, asDataFrame=True)
        return df
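
A hypothetical use of the Data class above; the table and column names follow the fields used in Example #7's insert (crawled_data, channel, keyword, post_date), while the keyword, dates, and the url/text columns are placeholders.

# Hypothetical usage sketch; values below are assumptions, not project data.
loader = Data("khroma")
loader.addData(channel="navershopping",
               keyword="sample keyword",
               fromDate="2021-01-01",
               toDate="2021-01-31",
               tablename="crawled_data",
               dbfnameChannel="channel",
               dbfnameKeyword="keyword",
               dbfnamePostDate="post_date",
               drop_duplicate_by=["keyword", "url"])
print(loader.shape())

# one row per sentence of the 'text' column
df_sentences = loader.get_df("keyword", "post_date", "text",
                             by_sentence_textColname="text")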
Example #6
    def __init__(self, dbName):
        self.db = Sql(dbName)
        self.df = pd.DataFrame()
Example #7
def crawl(keyword, productURL, productName, comment="navershopping"):
    # DB name
    db = Sql('dalmaden')
    task_id = db.insert('task_log', comment=comment)
    driver = SeleniumDriver()

    driver.get(productURL)

    # scroll down manually so the review section is rendered
    input("Scroll down to the review list manually, then press Enter")
    # then the code below runs
    maxPage = 10
    num = 0
    while True:
        # get contents
        # review container
        ele = driver.driver.find_element_by_css_selector(
            "#area_review_list .detail_list_review")
        # review list items
        ele = ele.find_elements_by_css_selector('li')
        for e in ele:

            print("################################")
            try:
                num += 1
                print(e.text)
                channel = 'navershopping'
                text_info = e.find_elements_by_css_selector(
                    'div.area_status_user span')
                # defaults, in case the optional fields below are missing
                author, date_raw, option, post_date = "", "", "", ""
                try:
                    author = text_info[0].text
                    date_raw = '20' + text_info[1].text
                    date_lst = date_raw.split(".")
                    post_date = "-".join(date_lst[:-1])
                    option = e.find_element_by_css_selector(
                        'p.text_option').text
                except:
                    pass
                text = e.find_element_by_css_selector('span.text').text
                text = cleanse(text)
                rating = e.find_element_by_css_selector(
                    'span.number_grade').text
                db.insert('crawled_data',
                          task_id=task_id,
                          channel=channel,
                          keyword=keyword,
                          num=num,
                          post_date=post_date,
                          title=productName,
                          text=text,
                          author=author,
                          url=productURL,
                          etc1=rating,
                          etc2=option)
            except Exception as ex:
                print(ex)

        # pagination
        # current page number
        pageNum = int(
            driver.driver.find_element_by_css_selector(
                "nav._review_list_page a[aria-selected='true']").text)
        print(pageNum)
        nextPage = pageNum + 1
        if nextPage > maxPage:
            # go to the next page group
            driver.driver.find_element_by_xpath(
                "//*[contains(@class,'module_pagination')]//a[contains(@class,'next')]"
            ).click()
            maxPage = pageNum + 10
        else:
            # next page
            driver.driver.find_element_by_xpath(
                "//*[contains(@class,'module_pagination')]//*[text()=%d]" %
                (nextPage)).click()
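
The Selenium calls above use the find_element_by_* helpers, which were removed in Selenium 4. If the project is run on a current Selenium, the equivalent lookups would look roughly like this (a sketch, assuming the same driver wrapper is in scope):

# Selenium 4 style equivalents for the lookups used above.
from selenium.webdriver.common.by import By

container = driver.driver.find_element(By.CSS_SELECTOR,
                                        "#area_review_list .detail_list_review")
reviews = container.find_elements(By.CSS_SELECTOR, "li")
page_elem = driver.driver.find_element(
    By.CSS_SELECTOR, "nav._review_list_page a[aria-selected='true']")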