def extractFeatureToDB(self, beginWeek, endWeek=None, isReload=False, useAlchemyAPI=False):
     """Build one feature record per song per week and store it in the DB.

     For every week from beginWeek to endWeek (both passed through
     dateToSaturday), the songs returned by getSongIdListByWeek for the
     *previous* week are processed. Each record combines the previous
     week's sales/radio/streaming ranks with the current week's IMVDB,
     MTV review, YouTube and Twitter data and the current week's Top 50
     rank.

     Args:
         beginWeek: datetime of the first week to process.
         endWeek: datetime of the last week to process. Defaults to the
             current time, resolved on every call.
         isReload: when false, songs that already have a feature record
             for the week are skipped; when true they are recomputed.
         useAlchemyAPI: forwarded unchanged to the MTV, YouTube and
             Twitter data getters.

     Raises:
         Exception: if beginWeek is before 2007-01-07 or endWeek is in
             the future.
     """
     # A `datetime.today()` default would be frozen at definition time;
     # resolve "now" per call instead.
     now = datetime.today()
     if endWeek is None:
         endWeek = now
     if beginWeek < datetime(2007, 1, 7) or endWeek > now:
         raise Exception('Invalid input date!')
     beginWeek, endWeek = dateToSaturday(beginWeek), dateToSaturday(endWeek)
     # If the normalized end date lies in the future, step back one week.
     if endWeek > now:
         endWeek = endWeek - timedelta(days=7)
     oneWeek = timedelta(days=7)
     db = DBController()
     iterWeek = beginWeek
     while iterWeek <= endWeek:
         lastWeek = iterWeek - oneWeek
         for songId in db.getSongIdListByWeek(lastWeek):
             if not isReload and db.isFeatureInDB(iterWeek, songId):
                 continue
             featureDict = {
                 'id': songId,
                 'week': iterWeek,
                 # Chart ranks are read from the previous week.
                 'sales': db.getSalesRank(lastWeek, songId),
                 'radio': db.getRadioRank(lastWeek, songId),
                 'streaming': db.getStreamingRank(lastWeek, songId),
             }
             featureDict['MVView'], featureDict['MVSocialInteraction'] = db.getIMVDBData(iterWeek, songId)
             featureDict['MTVReviewCount'], featureDict['MTVReviewScore'] = db.getMTVReviewData(iterWeek, songId, useAlchemyAPI)
             featureDict['youtubeCommentCount'], featureDict['youtubeCommentScore'] = db.getYoutubeData(iterWeek, songId, useAlchemyAPI)
             featureDict['twitterCount'], featureDict['twitterScore'] = db.getTwitterData(iterWeek, songId, useAlchemyAPI)
             # The Top 50 rank is read for the current week.
             featureDict['rank'] = db.getTop50Rank(iterWeek, songId)
             db.insertFeatureToDB(featureDict)
         iterWeek += oneWeek
 def train(self, beginWeek, endWeek, featureMode=0, regressionModelType=0):
     """Fit a regression model on the weekly feature matrices of a date range.

     Args:
         beginWeek: datetime of the first week to train on.
         endWeek: datetime of the last week to train on.
         featureMode: forwarded to FeatureGenerator.getFeatureMatrix.
         regressionModelType: forwarded to self.getRegressionModel.

     Returns:
         The model object obtained from self.getRegressionModel after
         fit() has been called for every week in the range.

     Raises:
         Exception: if beginWeek is before 2007-01-01 or endWeek is in
             the future.
     """
     outOfRange = beginWeek < datetime(2007, 1, 1) or endWeek > datetime.today()
     if outOfRange:
         raise Exception("Invalid input date!")
     firstWeek = dateToSaturday(beginWeek)
     finalWeek = dateToSaturday(endWeek)
     # If the normalized end date lies in the future, step back one week.
     if finalWeek > datetime.today():
         finalWeek = finalWeek - timedelta(days=7)
     generator = FeatureGenerator()
     model = self.getRegressionModel(regressionModelType)
     step = timedelta(weeks=1)
     currentWeek = firstWeek
     # NOTE(review): fit() is called once per week on that week's matrix
     # only; whether earlier weeks are retained depends on the model
     # returned by getRegressionModel -- confirm this is intended.
     while currentWeek <= finalWeek:
         weekMatrix = generator.getFeatureMatrix(currentWeek, currentWeek, featureMode)
         # The last column is the target; all preceding columns are features.
         features = weekMatrix[:, 0:-1]
         target = weekMatrix[:, -1]
         model.fit(features, target)
         currentWeek = currentWeek + step
     return model
 def extractSalesRankToDB(self, beginDate=None, endDate=None):
     """Fetch the weekly sales charts of a date range and store missing ones.

     For every week from beginDate to endDate (both passed through
     dateToSaturday), the chart is obtained via getURL and
     getSalesChartFromURL and inserted, unless
     DBController.checkSalesRankExistInDB already reports it as present.

     Args:
         beginDate: datetime of the first week. Defaults to the current
             time, resolved on every call.
         endDate: datetime of the last week. Defaults to the current
             time, resolved on every call.

     Raises:
         Exception: if beginDate is before 2007-01-01 or endDate is in
             the future.
     """
     # `datetime.today()` defaults would be frozen at definition time;
     # resolve "now" per call instead.
     now = datetime.today()
     if beginDate is None:
         beginDate = now
     if endDate is None:
         endDate = now
     if beginDate < datetime(2007, 1, 1) or endDate > now:
         raise Exception('Invalid input date!')
     beginDate = dateToSaturday(beginDate)
     endDate = dateToSaturday(endDate)
     # If the normalized end date lies in the future, step back one week.
     if endDate > now:
         endDate = endDate - timedelta(days=7)
     oneWeek = timedelta(days=7)
     db = DBController()
     iterDate = beginDate
     while iterDate <= endDate:
         # Only fetch weeks that are not already stored.
         if not db.checkSalesRankExistInDB(iterDate):
             URL = self.getURL(iterDate)
             chart = self.getSalesChartFromURL(URL)
             db.insertSalesChartToDB(iterDate, chart)
         iterDate += oneWeek
 def filterDataByWeek(self, rawDataList):
     """Keep only the samples whose date is unchanged by dateToSaturday.

     Args:
         rawDataList: iterable of (date, data) pairs.

     Returns:
         A list of {'week': date, 'count': data} dicts, in input order,
         for every pair where date == dateToSaturday(date).
     """
     return [
         {'week' : sampleDate, 'count' : sampleValue}
         for sampleDate, sampleValue in rawDataList
         if sampleDate == dateToSaturday(sampleDate)
     ]
 def getFeatureMatrix(self, beginWeek, endWeek=None, mode=0, withSongId=False):
     """Collect the feature vectors of a range of weeks into one matrix.

     For every week from beginWeek to endWeek (both passed through
     dateToSaturday), each feature dict returned by
     DBController.getFeatureListByWeek is converted with
     self.featureDictToList; dicts for which it returns None are skipped.

     Args:
         beginWeek: datetime of the first week.
         endWeek: datetime of the last week. Defaults to the current
             time, resolved on every call.
         mode: forwarded to self.featureDictToList.
         withSongId: forwarded to self.featureDictToList.

     Returns:
         A numpy.matrix with one row per retained feature vector.

     Raises:
         Exception: if beginWeek is before 2007-01-01 or endWeek is in
             the future.
     """
     # A `datetime.today()` default would be frozen at definition time;
     # resolve "now" per call instead.
     now = datetime.today()
     if endWeek is None:
         endWeek = now
     if beginWeek < datetime(2007, 1, 1) or endWeek > now:
         raise Exception('Invalid input date!')
     beginWeek, endWeek = dateToSaturday(beginWeek), dateToSaturday(endWeek)
     # If the normalized end date lies in the future, step back one week.
     if endWeek > now:
         endWeek = endWeek - timedelta(days=7)
     oneWeek = timedelta(weeks=1)
     db = DBController()
     rows = []
     iterWeek = beginWeek
     while iterWeek <= endWeek:
         for featureDict in db.getFeatureListByWeek(iterWeek):
             featureVector = self.featureDictToList(featureDict, mode, withSongId)
             if featureVector is not None:
                 rows.append(featureVector)
         iterWeek += oneWeek
     # numpy.matrix is kept so callers that slice the result keep getting
     # the same 2-D return type as before.
     return numpy.matrix(rows)
 def insertTop50ChartToDB(self, week, chart):
     """Replace the stored Top 50 chart of a week with the given chart.

     Args:
         week: datetime identifying the week; it is passed through
             dateToSaturday. Apart from the date part, all other fields
             must be zero.
         chart: iterable of 3-tuples whose second and third items are
             the song title and artist; the first item is ignored.

     Songs for which self.getSongId returns None are created with
     self.insertSong, so every entry of the stored rank list is an id.
     """
     weekKey = dateToSaturday(week)
     # Drop any chart already stored for this week before inserting.
     self.db.top50.remove({'week' : weekKey})
     rankedIds = []
     for entry in chart:
         _, title, artist = entry
         knownId = self.getSongId(title, artist)
         rankedIds.append(self.insertSong(title, artist) if knownId is None else knownId)
     self.db.top50.insert({'week' : weekKey, 'rank' : rankedIds})
 def insertSalesChartToDB(self, week, chart):
     """Replace the stored sales chart of a week with the given chart.

     Args:
         week: datetime identifying the week; it is passed through
             dateToSaturday.
         chart: iterable of 6-tuples whose first two items are the song
             title and artist; the remaining items are ignored.

     Each song is looked up by (title, artist) and, failing that, by
     title alone. Unlike the Top 50 insert, unknown songs are not
     created here: a miss is stored as None in the rank list.
     """
     weekKey = dateToSaturday(week)
     # Drop any chart already stored for this week before inserting.
     self.db.sales.remove({'week' : weekKey})
     rankedIds = []
     for row in chart:
         title, artist, _, _, _, _ = row
         matchedId = self.getSongId(title, artist)
         if matchedId is None:
             # Fall back to a title-only lookup; new songs are only
             # inserted from the Billboard chart, so a miss stays None.
             matchedId = self.getSongId(title)
         rankedIds.append(matchedId)
     self.db.sales.insert({'week' : weekKey, 'rank' : rankedIds})
 def extractDetailStatData(self, tables, URL):
     """Collect view and social counters from a page's statistic tables.

     Args:
         tables: iterable of objects exposing a `text` attribute.
         URL: stored unchanged under the 'URL' key of the result.

     Returns:
         A dict holding 'week' (dateToSaturday of the current time),
         'URL', and the counters read via self.getDetailStatTableData.
         A table whose cleaned text contains 'Views' supplies the MV
         counters; any other table supplies the Facebook, Twitter and
         Google+ counters.
     """
     mvFields = (
         ('MVViewCount', 'Views'),
         ('MVCommentCount', 'Comments'),
     )
     socialFields = (
         ('FBLikeCount', 'Facebook Like Count'),
         ('FBShareCount', 'Facebook Share Count'),
         ('FBCommentCount', 'Facebook Comment Count'),
         ('TwitterCount', 'Twitter'),
         ('GooglePlusCount', 'GooglePlusOne'),
     )
     stats = {'week' : dateToSaturday(datetime.today()), 'URL' : URL}
     for table in tables:
         text = cleanUnicode(table.text)
         # The presence of 'Views' decides which set of counters to read.
         fields = mvFields if 'Views' in text else socialFields
         for key, label in fields:
             stats[key] = self.getDetailStatTableData(text, label)
     return stats
 def checkSalesRankExistInDB(self, date):
     """Return True if a 40-entry sales chart is stored for the date's week.

     The week is looked up as dateToSaturday(date). A missing chart, or
     one whose 'rank' list does not have exactly 40 entries, yields False.
     """
     stored = self.db.sales.find_one({'week' : dateToSaturday(date)})
     if stored is None:
         return False
     return len(stored['rank']) == 40
 def checkTop50ExistInDB(self, date):
     """Return True if a 50-entry Top 50 chart is stored for the date's week.

     The week is looked up as dateToSaturday(date). A missing chart, or
     one whose 'rank' list does not have exactly 50 entries, yields False.
     """
     stored = self.db.top50.find_one({'week' : dateToSaturday(date)})
     return stored is not None and len(stored['rank']) == 50