def _dropEmptyDocs(docs, records):
    """Return (docs, records) without the positions whose doc is empty.

    The two sequences are treated as parallel (position i of `records`
    describes position i of `docs`); pairing stops at the shorter one.
    """
    kept = [(doc, rec) for doc, rec in zip(docs, records) if len(doc) > 0]
    return [doc for doc, _ in kept], [rec for _, rec in kept]


def preLda(vCid):
    """Build the dictionary, bag-of-words corpus and TF-IDF model for a video.

    Steps, as visible in this function:
      1. load the barrage rows for `vCid` via xmlDL.migrateBD;
      2. group them with getUserAndBarrageList and keep the per-barrage
         records selected by removeBLessNum(..., 3);
      3. tokenise the kept records with xmlDL.divideSent;
      4. drop every record whose token list is empty (the original comment
         says these are barrages with no N or A -- presumably noun/adjective,
         TODO confirm against divideSent);
      5. build corpora.Dictionary, the doc2bow corpus and models.TfidfModel.

    Progress and debug information is printed along the way.

    :param vCid: video id handed to xmlDL.migrateBD.
    :return: tuple (dic, corpus, tfidf, bContentList, newUniBContentListString)
             where bContentList and newUniBContentListString are parallel
             lists after the empty documents have been removed.
    """
    bList = xmlDL.migrateBD(vCid)

    # eg: uniBContentList: ['user', ['b1', 'b2', 'b3']]
    # eg: uniBContentList2: ['user1', '157.430', 'b1'], ['user1', '159.430', 'b2']....
    # eg: uniBContentListString: ['22ccd704', '\xe7\x88\xb7\xe7\x88\xb7QAQ']
    (uniBContentList, uniBContentList2, uniBContentListString,
     bNumPerUser) = getUserAndBarrageList(bList)

    # Works on the "one record per barrage" list (uniBContentList2); the
    # "all barrages of a user in one string" list (uniBContentListString)
    # was the alternative the original author had tried here.
    newUniBContentListString = removeBLessNum(uniBContentList2, bNumPerUser, 3)
    bContentList = xmlDL.divideSent(newUniBContentListString, 1)

    # Sizes before / after removing barrages that produced no tokens.
    print('%d %d' % (len(bContentList), len(newUniBContentListString)))
    # Filter both lists together. Deleting by index inside a
    # range(len(...)) loop skipped the element following every deletion
    # and needed a bare except to hide the resulting IndexError.
    bContentList, newUniBContentListString = _dropEmptyDocs(
        bContentList, newUniBContentListString)
    print('%d %d' % (len(bContentList), len(newUniBContentListString)))

    # Debug sample; guarded because indices 3 and 10 do not exist on
    # small inputs.
    if len(newUniBContentListString) > 10:
        print('---newUniBContentListString %s %s %s' % (
            newUniBContentListString[3][0],
            newUniBContentListString[10][1],
            newUniBContentListString[10][2]))

    # Original author's note for their data set:
    # 276088 words totally, 17941 unique words.
    # dictionary of all words
    dic = corpora.Dictionary(bContentList)
    print(type(dic))
    # The double space reproduces the output of the original
    # multi-item print statement.
    print('dictionary number of docs, num_pos, number of terms:  %s %s %s' % (
        dic.num_docs, dic.num_pos, len(dic)))

    # text corpus
    corpus = [dic.doc2bow(text) for text in bContentList]
    tfidf = models.TfidfModel(corpus)
    print('%s %s' % (tfidf, type(tfidf)))

    # Debug: weight a tiny hand-made bag-of-words and show which tokens
    # own ids 0 and 4.
    example = [(0, 1), (2, 1)]
    print(tfidf[example])
    for word, index in dic.token2id.iteritems():
        if index == 4 or index == 0:
            word = word.encode('utf-8')
            print('%s %s' % (word, index))

    return dic, corpus, tfidf, bContentList, newUniBContentListString
def getScoreListFromVCid3(vCid):
    """Score every barrage row of a video.

    Each row returned by xmlDL.migrateBD(vCid) is segmented with
    jieba.lcut (accurate mode, cut_all=False) and scored with
    sentimentAnalysis.getScore.

    :param vCid: video id handed to xmlDL.migrateBD.
    :return: list with one tuple per row, laid out as
             (float(row[0]), row[6]) + <getScore tuple> + (row[8], 0).
             row[8] is the text that gets segmented; row[0] and row[6]
             are presumably the timestamp and the user id -- TODO confirm
             against the migrateBD column layout.
    """
    # NOTE(review): the original also evaluated getUserList(3) here ("userList
    # whose barrage is larger than 3") but never used the result, so no
    # per-user filtering has ever been applied. The dead call is dropped; it
    # only repeated the migrateBD read and a grouping pass.
    scoreList3 = []
    for b in xmlDL.migrateBD(vCid):
        word_list = jieba.lcut(b[8], cut_all=False)
        score = sentimentAnalysis.getScore(word_list)
        scoreList3.append((float(b[0]), b[6]) + score + (b[8], 0))
    return scoreList3
def getUserList(num):
    """Collect the users whose barrage count reaches `num`.

    Reads the rows of the module-configured video (xmlDL.vCid), groups them
    with lda.getUserAndBarrageList and scans the fourth result, whose entries
    are indexed as entry[0] (value collected) and entry[1] (value compared
    with `num`) -- presumably (user, barrage count); verify against
    getUserAndBarrageList.

    :param num: inclusive lower bound for entry[1].
    :return: list of entry[0] values, in the order they were produced.
    """
    rows = xmlDL.migrateBD(xmlDL.vCid)
    # Only the per-user counts are needed; the other three groupings are
    # unpacked solely to keep the four-value contract checked.
    _grouped, _perBarrage, _perUserText, perUserCounts = \
        lda.getUserAndBarrageList(rows)
    return [entry[0] for entry in perUserCounts if entry[1] >= num]
def writeUIdList(vCid, filePath):
    """Write the unique user ids of a video, with their translated ids, to disk.

    Column 6 of the array returned by xmlDL.migrateBD(vCid) is de-duplicated
    with np.unique; each unique id is passed through hashTrans and written as
    one "<userId>,<hashTrans result>" line to ``filePath + 'userIdList.txt'``.
    Each translated id and two summary lines are also printed.

    :param vCid: video id handed to xmlDL.migrateBD.
    :param filePath: output directory; created with os.mkdir when missing.
                     The file name is appended by plain concatenation, so the
                     path must already end with a separator for the file to
                     land inside that directory.
    :return: None.
    """
    userList = xmlDL.migrateBD(vCid)[:, 6]
    uniUsers = np.unique(userList)
    print('---------totally %s unique users in %s' % (len(uniUsers), vCid))

    if not os.path.exists(filePath):
        os.mkdir(filePath)

    # The context manager closes (and flushes) the file even when hashTrans
    # or a write fails; the original never closed the handle.
    with open(filePath + 'userIdList.txt', 'w') as fo:
        for userId in uniUsers:
            transUserId = hashTrans(userId)
            print(transUserId)
            fo.write(userId + ',' + str(transUserId) + '\n')

    print('-----------all userIdList of %s is downloaded to local' % (vCid,))
# userList whose barrage is larger than 3 bList = newUniBContentListString for b in bList: word_list = jieba.lcut(b[1], cut_all=False) a = sentimentAnalysis.getScore(word_list) a = (b[0], b[1]) + a scoreList2.append(a) return scoreList2 scoreList = [] posScoreList = [] negScoreList = [] if __name__ == '__main__': bList = xmlDL.migrateBD(xmlDL.vCid) # eg: uniBContentListString: ['22ccd704', '\xe7\x88\xb7\xe7\x88\xb7QAQ'] uniBContentList, uniBContentList2, uniBContentListString, bNumPerUser = lda.getUserAndBarrageList( bList) print 'uni', len(uniBContentListString) newUniBContentListString = lda.removeBLessNum(uniBContentListString, bNumPerUser, 3) print 'new', len(newUniBContentListString) scoreList2 = getScoreListFromVCid(newUniBContentListString) # for i in scoreList2: # for j in i: # print j, # print