class UserwiseDivergenceAnalysis:
    """Compute per-user, per-month divergences against reference distributions.

    The class wires a ``DataHandler`` (data access and distribution building)
    to a ``DistComparer`` (whose ``jsDivergence`` method does the comparison).
    For each month of a user's activity it produces a pair:

        (jsDivergence(initial, month), jsDivergence(month, matured))

    where "initial" and "matured" are reference distributions taken from the
    user's own data, from the user's active forum, or from the "AllTalk"
    background, depending on which ``prepare*`` method is called.
    """

    # Location of the stop-word list, one word per line.
    STOP_WORDS_PATH = "/usr0/home/pgadde/Work/Ethnic/Hoodup/DataExploration/SampledPosts/Regression/stopWords"

    def __init__(self, dataFile, userJoins):
        """Build the data handler, load active forums and preprocess the vocab.

        ``dataFile`` and ``userJoins`` are passed straight to ``DataHandler``.
        """
        sys.stderr.write('In Constructor\n')
        self.distComparer = DistComparer()
        self.dataHandler = DataHandler(dataFile, userJoins)
        self.dataHandler.loadActiveForums()
        self.__loadData()
        self.sampledUsers = set()

    def __loadData(self):
        """Read the stop-word list and hand it to the data handler."""
        # The context manager closes the file even if reading fails.
        with open(self.STOP_WORDS_PATH) as stopWordFile:
            stopWords = set(line.strip() for line in stopWordFile)
        self.dataHandler.preprocessVocab(stopWords)

    def sampleUsers(self):
        """Replace ``self.sampledUsers`` with the data handler's user sample."""
        self.sampledUsers = self.dataHandler.sampleUsers()

    def doDivergenceAnalysisPerUser(self, outFile):
        """Write one tab-separated line per (user, month) to ``outFile``.

        Columns: user, month, divergence from the initial distribution,
        divergence to the matured distribution.

        ``outFile`` is a path; the file is opened for writing (truncated).
        """
        # NOTE(review): the statements between the user loop and the write
        # were unreadable in the version of the file this was restored from.
        # The reconstruction assumes the divergences come from
        # prepareUserDivergences() and that every month of the returned
        # mapping is written, in the mapping's own iteration order -- confirm
        # against version control.
        with open(outFile, 'w') as out:
            for user in self.sampledUsers:
                userDivergences = self.prepareUserDivergences(user)
                # prepareUserDivergencesActive() signals "not applicable" with
                # -1; skip such a result so this loop stays correct if the
                # divergence source is switched to that variant.
                if userDivergences == -1:
                    continue
                for month in userDivergences:
                    fromInitial, toMatured = userDivergences[month]
                    fields = (user, month, fromInitial, toMatured)
                    out.write('\t'.join(str(field) for field in fields) + '\n')

    def _monthlyDivergences(self, userNum, userMonths, initialDist, maturedDist):
        """Return ``{month: (js(initial, month), js(month, matured))}``.

        Shared by the ``prepare*`` methods, which differ only in where the
        two reference distributions come from.
        """
        divergences = {}
        for userMonth in userMonths:
            monthDist = self.dataHandler.makeDist(
                self.dataHandler.getUserDataForDivergence(userNum, userMonth))
            divergences[userMonth] = (
                self.distComparer.jsDivergence(initialDist, monthDist),
                self.distComparer.jsDivergence(monthDist, maturedDist),
            )
        return divergences

    def prepareUserDivergencesActive(self, userNum):
        """Compare each month of the user against the user's active forum.

        Returns a dict mapping month to a divergence pair, or ``-1`` when the
        name of the user's active forum does not contain ``"Talk"``.
        """
        userMonths = self.dataHandler.getUserMonths(userNum)
        activeForum = self.dataHandler.getActiveForum(userNum)
        if "Talk" not in activeForum:
            # Sentinel kept as-is: callers may depend on the -1 return.
            return -1
        initialDist = self.dataHandler.makeDist(
            self.dataHandler.getForumInitialData(activeForum))
        maturedDist = self.dataHandler.makeDist(
            self.dataHandler.getForumMaturedData(activeForum))
        return self._monthlyDivergences(userNum, userMonths, initialDist, maturedDist)

    def prepareUserDivergencesBackground(self, userNum):
        """Compare each month of the user against the "AllTalk" forum data.

        Returns a dict mapping month to a divergence pair.
        """
        # Must be self.dataHandler: a double-underscore attribute would be
        # name-mangled inside the class and no such attribute is ever set.
        userMonths = self.dataHandler.getUserMonths(userNum)
        initialDist = self.dataHandler.makeDist(
            self.dataHandler.getForumInitialData("AllTalk"))
        maturedDist = self.dataHandler.makeDist(
            self.dataHandler.getForumMaturedData("AllTalk"))
        return self._monthlyDivergences(userNum, userMonths, initialDist, maturedDist)

    def prepareUserDivergences(self, userNum):
        """Compare each month of the user against the user's own reference data.

        Returns a dict mapping month to a divergence pair.
        """
        userMonths = self.dataHandler.getUserMonths(userNum)
        initialDist = self.dataHandler.makeDist(
            self.dataHandler.getUserInitialData(userNum))
        maturedDist = self.dataHandler.makeDist(
            self.dataHandler.getUserMaturedData(userNum))
        return self._monthlyDivergences(userNum, userMonths, initialDist, maturedDist)