def extractFeatureToDB(self, beginWeek, endWeek=None, isReload=False, useAlchemyAPI=False):
    """Build one feature record per song per week and store it via DBController.

    Both bounds are normalized with dateToSaturday and the range is walked in
    7-day steps.  For each week, the song list is taken from the *previous*
    week (db.getSongIdListByWeek); sales, radio and streaming ranks are also
    read for the previous week, while the IMVDB, MTV review, YouTube, Twitter
    values and the Top-50 rank are read for the current week.

    Args:
        beginWeek: datetime of the first week to process.
        endWeek: datetime of the last week to process.  Defaults to the
            current time, resolved on every call (a ``datetime.today()``
            default in the signature would be frozen at definition time).
        isReload: when falsy, (week, song) pairs for which
            db.isFeatureInDB returns true are skipped.
        useAlchemyAPI: passed through to the MTV review, YouTube and Twitter
            getters.

    Raises:
        Exception: if beginWeek is before 2007-01-07 or endWeek is in the
            future.
    """
    now = datetime.today()
    if endWeek is None:
        endWeek = now
    if beginWeek < datetime(2007, 1, 7) or endWeek > now:
        raise Exception('Invalid input date!')

    beginWeek, endWeek = dateToSaturday(beginWeek), dateToSaturday(endWeek)
    # If normalization pushed the end bound past "now", step back one week.
    if endWeek > now:
        endWeek = endWeek - timedelta(days=7)

    oneWeek = timedelta(days=7)
    db = DBController()
    iterWeek = beginWeek
    while iterWeek <= endWeek:
        lastWeek = iterWeek - oneWeek
        for songId in db.getSongIdListByWeek(lastWeek):
            if not isReload and db.isFeatureInDB(iterWeek, songId):
                continue
            featureDict = {'id': songId, 'week': iterWeek}
            # Chart-based inputs come from the previous week.
            featureDict['sales'] = db.getSalesRank(lastWeek, songId)
            featureDict['radio'] = db.getRadioRank(lastWeek, songId)
            featureDict['streaming'] = db.getStreamingRank(lastWeek, songId)
            # The remaining inputs and the target rank use the current week.
            featureDict['MVView'], featureDict['MVSocialInteraction'] = \
                db.getIMVDBData(iterWeek, songId)
            featureDict['MTVReviewCount'], featureDict['MTVReviewScore'] = \
                db.getMTVReviewData(iterWeek, songId, useAlchemyAPI)
            featureDict['youtubeCommentCount'], featureDict['youtubeCommentScore'] = \
                db.getYoutubeData(iterWeek, songId, useAlchemyAPI)
            featureDict['twitterCount'], featureDict['twitterScore'] = \
                db.getTwitterData(iterWeek, songId, useAlchemyAPI)
            featureDict['rank'] = db.getTop50Rank(iterWeek, songId)
            db.insertFeatureToDB(featureDict)
        iterWeek += oneWeek
def train(self, beginWeek, endWeek, featureMode=0, regressionModelType=0):
    """Fit a regression model on weekly feature matrices and return it.

    The bounds are normalized with dateToSaturday; for every week in the
    range a feature matrix is requested from FeatureGenerator for that single
    week, split into inputs (all columns but the last) and target (last
    column), and passed to the model's ``fit``.

    Args:
        beginWeek: datetime of the first training week.
        endWeek: datetime of the last training week.
        featureMode: forwarded to FeatureGenerator.getFeatureMatrix.
        regressionModelType: forwarded to self.getRegressionModel.

    Returns:
        The model object obtained from self.getRegressionModel after fitting.

    Raises:
        Exception: if beginWeek is before 2007-01-01 or endWeek is in the
            future.
    """
    if beginWeek < datetime(2007, 1, 1) or endWeek > datetime.today():
        raise Exception("Invalid input date!")

    firstWeek = dateToSaturday(beginWeek)
    finalWeek = dateToSaturday(endWeek)
    # If normalization pushed the end bound into the future, step back a week.
    if finalWeek > datetime.today():
        finalWeek = finalWeek - timedelta(days=7)

    generator = FeatureGenerator()
    model = self.getRegressionModel(regressionModelType)
    step = timedelta(weeks=1)
    week = firstWeek
    while week <= finalWeek:
        weekMatrix = generator.getFeatureMatrix(week, week, featureMode)
        inputs = weekMatrix[:, :-1]
        target = weekMatrix[:, -1]
        # NOTE(review): fit() is invoked once per week with that week's data
        # only; if the model's fit() discards previous state, just the final
        # week is effectively learned -- confirm this is intended.
        model.fit(inputs, target)
        week += step
    return model
def extractSalesRankToDB(self, beginDate=None, endDate=None):
    """Fetch weekly sales charts and store the ones missing from the database.

    Both bounds are normalized with dateToSaturday and walked in 7-day steps.
    Weeks for which db.checkSalesRankExistInDB returns true are skipped; for
    the others the chart is fetched through self.getURL and
    self.getSalesChartFromURL and stored with db.insertSalesChartToDB.

    Args:
        beginDate: datetime of the first week.  Defaults to the current
            time, resolved on every call.
        endDate: datetime of the last week.  Defaults to the current time,
            resolved on every call.

    Raises:
        Exception: if beginDate is before 2007-01-01 or endDate is in the
            future.
    """
    # Resolve "today" per call; a datetime.today() default in the signature
    # would be evaluated once at definition time and go stale.
    now = datetime.today()
    if beginDate is None:
        beginDate = now
    if endDate is None:
        endDate = now
    if beginDate < datetime(2007, 1, 1) or endDate > now:
        raise Exception('Invalid input date!')

    beginDate = dateToSaturday(beginDate)
    endDate = dateToSaturday(endDate)
    # If normalization pushed the end bound past "now", step back one week.
    if endDate > now:
        endDate = endDate - timedelta(days=7)

    oneWeek = timedelta(days=7)
    db = DBController()
    iterDate = beginDate
    while iterDate <= endDate:
        if not db.checkSalesRankExistInDB(iterDate):
            URL = self.getURL(iterDate)
            chart = self.getSalesChartFromURL(URL)
            db.insertSalesChartToDB(iterDate, chart)
        iterDate = iterDate + oneWeek
def filterDataByWeek(self, rawDataList):
    """Keep only the entries whose date is unchanged by dateToSaturday.

    Args:
        rawDataList: iterable of (date, data) pairs.

    Returns:
        A list of ``{'week': date, 'count': data}`` dicts, one for each pair
        where ``date == dateToSaturday(date)``, in input order.
    """
    return [
        {'week': day, 'count': value}
        for day, value in rawDataList
        if day == dateToSaturday(day)
    ]
def getFeatureMatrix(self, beginWeek, endWeek=None, mode=0, withSongId=False):
    """Collect stored feature vectors for a range of weeks into a numpy matrix.

    Both bounds are normalized with dateToSaturday and walked in one-week
    steps.  Every feature dict returned by db.getFeatureListByWeek is
    converted with self.featureDictToList; conversions that yield None are
    dropped.

    Args:
        beginWeek: datetime of the first week.
        endWeek: datetime of the last week.  Defaults to the current time,
            resolved on every call (a ``datetime.today()`` default in the
            signature would be frozen at definition time).
        mode: forwarded to self.featureDictToList.
        withSongId: forwarded to self.featureDictToList.

    Returns:
        numpy.matrix built from the collected feature vectors, one row per
        vector.

    Raises:
        Exception: if beginWeek is before 2007-01-01 or endWeek is in the
            future.
    """
    now = datetime.today()
    if endWeek is None:
        endWeek = now
    if beginWeek < datetime(2007, 1, 1) or endWeek > now:
        raise Exception('Invalid input date!')

    beginWeek, endWeek = dateToSaturday(beginWeek), dateToSaturday(endWeek)
    # If normalization pushed the end bound past "now", step back one week.
    if endWeek > now:
        endWeek = endWeek - timedelta(days=7)

    oneWeek = timedelta(weeks=1)
    db = DBController()
    rows = []
    iterWeek = beginWeek
    while iterWeek <= endWeek:
        for featureDict in db.getFeatureListByWeek(iterWeek):
            featureVector = self.featureDictToList(featureDict, mode, withSongId)
            if featureVector is not None:
                rows.append(featureVector)
        iterWeek += oneWeek
    return numpy.matrix(rows)
def insertTop50ChartToDB(self, week, chart):
    """Replace the stored Top-50 chart for a week.

    Any existing ``top50`` document for the normalized week is removed, each
    chart entry is resolved to a song id (inserting the song when
    self.getSongId finds nothing), and a new document
    ``{'week': ..., 'rank': [song ids in chart order]}`` is inserted.

    Args:
        week: datetime; per the original note, everything except the date
            part must be zero.  It is normalized with dateToSaturday.
        chart: iterable of 3-tuples whose 2nd and 3rd items are the title
            and the artist; the 1st item is ignored.
    """
    saturday = dateToSaturday(week)
    self.db.top50.remove({'week': saturday})

    rankedIds = []
    for entry in chart:
        _, title, artist = entry
        knownId = self.getSongId(title, artist)
        # Unknown songs are created on the fly so every chart slot has an id.
        rankedIds.append(self.insertSong(title, artist) if knownId is None else knownId)

    self.db.top50.insert({'week': saturday, 'rank': rankedIds})
def insertSalesChartToDB(self, week, chart):
    """Replace the stored sales chart for a week.

    Any existing ``sales`` document for the normalized week is removed and a
    new document ``{'week': ..., 'rank': [song ids in chart order]}`` is
    inserted.  Songs are looked up by (title, artist) first and by title
    alone as a fallback; entries that still cannot be resolved are stored as
    None.

    Args:
        week: datetime, normalized with dateToSaturday.
        chart: iterable of 6-tuples whose first two items are the title and
            the artist; the remaining four items are ignored.
    """
    saturday = dateToSaturday(week)
    self.db.sales.remove({'week': saturday})

    rankedIds = []
    for title, artist, _, _, _, _ in chart:
        match = self.getSongId(title, artist)
        if match is None:
            # Fall back to a title-only lookup.  A miss stays None on
            # purpose: new songs are only inserted from the Billboard chart.
            match = self.getSongId(title)
        rankedIds.append(match)

    self.db.sales.insert({'week': saturday, 'rank': rankedIds})
def extractDetailStatData(self, tables, URL):
    """Collect detail statistics from a list of table elements into one dict.

    Each table's text is cleaned with cleanUnicode.  A table whose text
    contains 'Views' supplies the music-video view/comment counts; any other
    table supplies the Facebook, Twitter and Google+ counts.  Values are
    extracted with self.getDetailStatTableData.

    Args:
        tables: iterable of objects exposing a ``text`` attribute.
        URL: stored unchanged under the 'URL' key.

    Returns:
        dict with 'week' (dateToSaturday of the current time), 'URL', and
        whichever count keys the given tables provided.
    """
    stats = {'week': dateToSaturday(datetime.today()), 'URL': URL}

    # (result key, label passed to getDetailStatTableData), in lookup order.
    videoFields = (
        ('MVViewCount', 'Views'),
        ('MVCommentCount', 'Comments'),
    )
    socialFields = (
        ('FBLikeCount', 'Facebook Like Count'),
        ('FBShareCount', 'Facebook Share Count'),
        ('FBCommentCount', 'Facebook Comment Count'),
        ('TwitterCount', 'Twitter'),
        ('GooglePlusCount', 'GooglePlusOne'),
    )

    for table in tables:
        text = cleanUnicode(table.text)
        fields = videoFields if text.find('Views') >= 0 else socialFields
        for key, label in fields:
            stats[key] = self.getDetailStatTableData(text, label)
    return stats
def checkSalesRankExistInDB(self, date):
    """Tell whether a complete sales chart is stored for the week of ``date``.

    Args:
        date: datetime, normalized with dateToSaturday before the lookup.

    Returns:
        True only when a ``sales`` document exists for that week and its
        'rank' list has exactly 40 entries; False otherwise.
    """
    record = self.db.sales.find_one({'week': dateToSaturday(date)})
    if record is None:
        return False
    return len(record['rank']) == 40
def checkTop50ExistInDB(self, date):
    """Tell whether a complete Top-50 chart is stored for the week of ``date``.

    Args:
        date: datetime, normalized with dateToSaturday before the lookup.

    Returns:
        True only when a ``top50`` document exists for that week and its
        'rank' list has exactly 50 entries; False otherwise.
    """
    record = self.db.top50.find_one({'week': dateToSaturday(date)})
    return record is not None and len(record['rank']) == 50