Example #1
import re

import requests
from bs4 import BeautifulSoup

# Sql and CUSTOM_HEADER are assumed to be provided by the surrounding project.


def update_info():
    db = Sql("khroma")
    dealID_data = db.select("musicow_deal", "dealID")
    # De-duplicate deal IDs with a set comprehension.
    dealID_list = {d["dealID"] for d in dealID_data}
    for dealID in dealID_list:
        try:
            print(dealID)
            url = "https://www.musicow.com/song/%s?tab=info" % (dealID)
            CUSTOM_HEADER['referer'] = url
            r = requests.get(url, headers=CUSTOM_HEADER)
            bs = BeautifulSoup(r.text, 'html.parser')
            title = bs.select_one('strong.song_title').text.strip()
            singer = bs.select_one('span.artist').text.strip()
            auction = bs.select('div.row-col-2 dd')
            # Strip every non-digit character before casting to int;
            # raw strings avoid the invalid "\D" escape warning.
            auctionAmount = int(re.sub(r"\D", "", auction[0].text))
            auctionStartPrice = int(re.sub(r"\D", "", auction[1].text))
            auctionLowPrice = int(re.sub(r"\D", "", auction[2].text))
            auctionAvgPrice = int(re.sub(r"\D", "", auction[3].text))

            db.insert_withoutDuplication('musicow_info',
                                         check_list=['dealID'],
                                         dealID=dealID,
                                         title=title,
                                         singer=singer,
                                         auctionAmount1=auctionAmount,
                                         auctionStartPrice1=auctionStartPrice,
                                         auctionLowPrice1=auctionLowPrice,
                                         auctionAvgPrice1=auctionAvgPrice)
        except Exception as ex:
            print(ex)
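
A minimal sketch of the module-level pieces this example assumes. CUSTOM_HEADER only needs to be a mutable dict of request headers, since the loop overwrites its 'referer' key on every iteration; the User-Agent value below is purely illustrative, not taken from the source.

CUSTOM_HEADER = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',  # illustrative value
}

if __name__ == '__main__':
    update_info()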
Example #2
import kss
import pandas as pd
from tqdm import tqdm

# Sql, cleanse_text and cleanse_sentence are assumed to be provided by the
# surrounding project.


class Data:
    def __init__(self, dbName):
        self.db = Sql(dbName)
        self.df = pd.DataFrame()

    def drop_duplicates(self, subset):
        self.df = self.df.drop_duplicates(subset=subset)

    def shape(self):
        return self.df.shape

    def setDB(self, dbName):
        pass

    def get_df(self, *colnames, by_sentence_textColname=None):
        '''
        :param colnames: column names (str)
        :param by_sentence_textColname: name of the text column to split into sentences
        :return: DataFrame
        '''
        df_documents = self.df.loc[:, list(colnames)]
        if by_sentence_textColname:
            sentence_rows = []
            nrows = df_documents.shape[0]
            for i in tqdm(range(nrows), "loader : Getting Sentences "):
                row = df_documents.iloc[i]
                text = row[by_sentence_textColname]
                if len(text) == 0:
                    continue
                text = cleanse_text(text)
                # Texts longer than 300 characters show up all the time... needs checking.
                sentences = kss.split_sentences(text)
                for s in sentences:
                    s = cleanse_sentence(s)
                    if len(s) > 0:
                        # One output row per sentence, keeping the other columns.
                        row_temp = row.copy()
                        row_temp[by_sentence_textColname] = s
                        sentence_rows.append(row_temp)
            # Build the frame once at the end: DataFrame.append was removed in
            # pandas 2.0, and appending row by row is quadratic anyway.
            df_sentences = pd.DataFrame(sentence_rows)
            print(
                f"loader : Getting DataFrame Done {nrows} Documents to {df_sentences.shape[0]} Sentences"
            )
            return df_sentences
        else:
            return df_documents

    def _getSentence_(self, text_seq):
        pass

    def addData(self,
                channel,
                keyword,
                fromDate,
                toDate,
                tablename,
                dbfnameChannel,
                dbfnameKeyword,
                dbfnamePostDate,
                drop_duplicate_by=None):
        '''
        :param channel: str
        :param keyword: str
        :param fromDate: 'yyyy-mm-dd'
        :param toDate: 'yyyy-mm-dd'
        :param tablename: str
        :param drop_duplicate_by: list of column names to deduplicate by, e.g. ['keyword', 'url']
        :return:
        '''
        nrows0 = self.df.shape[0]
        ldf = self._load_(channel, keyword, fromDate, toDate, tablename,
                          dbfnameChannel, dbfnameKeyword, dbfnamePostDate)
        print(ldf)
        nrowsldf = ldf.shape[0]

        # DataFrame.append was removed in pandas 2.0; concatenate instead.
        self.df = pd.concat([self.df, ldf], ignore_index=True)
        addednRows = nrowsldf
        droppednRows = 0

        if drop_duplicate_by:
            # Assign the result back; drop_duplicates is not in-place by default.
            self.df = self.df.drop_duplicates(subset=drop_duplicate_by)
            addednRows = self.df.shape[0] - nrows0
            droppednRows = nrowsldf - addednRows
        print(
            f'addData : added {addednRows} rows (dropped {droppednRows} rows)')

    def _load_(self, channel, keyword, fromDate, toDate, tablename,
               dbfnameChannel, dbfnameKeyword, dbfnamePostDate):
        where_str = f"{dbfnameKeyword}='{keyword}' and {dbfnameChannel}='{channel}' and {dbfnamePostDate} between '{fromDate}' and '{toDate}'"
        df = self.db.select(tablename, "*", where_str, asDataFrame=True)
        return df
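
A sketch of how the class appears to be used, assuming the khroma database holds a posts table whose column names match the dbfname* arguments; every table and column name below is illustrative, not confirmed by the source.

data = Data("khroma")
data.addData(channel="blog", keyword="musicow",
             fromDate="2021-01-01", toDate="2021-12-31",
             tablename="posts",          # hypothetical table name
             dbfnameChannel="channel",   # hypothetical column names
             dbfnameKeyword="keyword",
             dbfnamePostDate="post_date",
             drop_duplicate_by=["keyword", "url"])
# Explode each document into one row per sentence of its text column.
df_sentences = data.get_df("keyword", "text", by_sentence_textColname="text")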