def __preProcess(): dts.loop_with_param(__readin, tweets, 'loading files') for tweet in tweets: #dts.writeO( tmp + '\n' ) tmp = {u'text': tweet} dts.writeO(json.dumps(tmp) + '\n') setlen = len(tweets) print '%d tweets remaining' % setlen dts.writeL('%d tweets remaining' % setlen)
def featureVectorParse():
    """Parse the feature-vector file line by line via `__lineParse`.

    Configures the dts I/O helper, runs the parse loop, then logs the
    accumulated `name_dict` before closing all files.
    """
    dts.setSize(10000)
    dts.setFile('../data/featvect',
                '../emojiOutput/featureWang10000_no01',
                '../log/featureWang')
    dts.openFiles()
    dts.loop(__lineParse, 'parse featvect')
    dts.writeL(str(name_dict))
    dts.closeFiles()
def divideHashtag(): dts.setSize(1000000) dts.setFile('../hashOutput/afterPre.txt', '../hashOutput/divideHashtag.txt', '../log/divideHashtag.log') dts.openFiles() dts.loop(__divide, 'divide by Hashtag') for emo in EmoList: print 'label %d \t: %d' % (emo['label'], emo['cnt']) dts.writeL('label %d \t: %d\n' % (emo['label'], emo['cnt'])) dts.closeFiles()
def parse_line():
    """Read one JSON tweet record and emit a "<label> <features>" line.

    Skips blank/EOF lines and records that are not valid JSON or lack the
    expected 'label'/'text' fields. Increments the global positive counter
    `PC` when the label equals 1, and logs every label written.
    """
    line = dts.readlineI()
    if not line:
        return
    # FIX: was a bare `except:` which also swallowed KeyboardInterrupt /
    # SystemExit; narrow to the errors a malformed record actually raises.
    try:
        obj = json.loads(line)
        label = obj[u'label']
        text = obj[u'text']
    except (ValueError, KeyError):
        # Malformed record — skip it rather than abort the whole run.
        return
    global PC
    if int(label) == 1:
        PC += 1
    output = '%d %s\n' % (label, gen_Feature(text))
    dts.writeO(output)
    dts.writeL(str(label) + '\n')
def make_bigram(): dict = {} dts.loop_with_param_clean(__bigram, __bigram_clean, dict, 'Make the dict of bigram') print 'start to output' cntList = {} for x in range(100000): cntList.update({x: 0}) for k, v in dict.iteritems(): if v > 10: dts.writeO(k + ':' + str(v) + '\n') if v >= 100000: cntList.update({100000: cntList.get(100000, 0)}) else: cntList.update({v: cntList.get(v, 0) + 1}) for k, v in cntList.iteritems(): dts.writeL(str(k) + ':' + str(v) + '\n')
def labelCounter(): dts.setSize(100000) dts.setFile('../data/featvect', '', '../log/featvectLabelCount') dts.openFiles() global counter for x in range(9): counter[x] = 0 dts.loop(__line, 'parse featvect') sum = 0 for x in range(9): sum += counter[x] for x in range(9): print 'Label\t%d\t:%d (%.2f%%)' % ( x, counter[x], float(counter[x] * 100.0) / float(sum)) dts.writeL('Label\t%d\t:%d (%.2f%%)\n' % (x, counter[x], float(counter[x] * 100.0) / float(sum))) print 'Sum\t\t:%d' % sum dts.closeFiles()
def make_dict():
    """Compute per-word document-frequency counts (for IDF), write words whose
    count lies in (10, 364600) to the raw dictionary file, and log a bucketed
    distribution of all counts.
    """
    dts.setFile('../output/afterPre.txt', '../output/Dict_raw.txt', '../log/idf.log')
    dts.setSize(25770000)
    dts.openFiles()
    dict = {}  # NOTE(review): shadows the builtin `dict`; left unchanged here
    dts.loop_with_param(__calcIDF, dict, 'calc the Idf')
    print 'start sort and print'
    cnt = 0    # total distinct words seen
    pcnt = 0   # words actually written to the output file
    CntDistribution = {}
    CNT_MAX = 1000000
    for x in range(CNT_MAX + 1):
        CntDistribution[x] = 0
    # Iterate words in sorted key order so the output is deterministic.
    for key, value in [(k, dict[k]) for k in sorted(dict.keys())]:
        # Only mid-frequency words go to the dictionary file.
        if value > 10 and value < 364600:
            dts.writeO('%s:%d\n' % (key, value))
            pcnt += 1
        cnt += 1
        if (value > 364600):
            print key
        # Histogram in buckets of 10; anything above CNT_MAX*10 is clamped
        # into the last bucket.
        if (value > CNT_MAX * 10):
            CntDistribution[CNT_MAX] += 1
        else:
            CntDistribution[value / 10] += 1
    print '%d words output' % pcnt
    dts.writeL('%d words output\n' % pcnt)
    print 'printing range log'
    # Log, per bucket, how many words have a count ABOVE that bucket's range
    # (cnt - cumulative ncnt). NOTE(review): indentation reconstructed from a
    # collapsed source line — confirm writeL belongs inside this loop.
    ncnt = 0
    for x in range(CNT_MAX):
        ncnt += CntDistribution[x]
        dts.writeL('%7d~%7d:\t%d\n' % (x * 10, (x + 1) * 10, cnt - ncnt))
    dts.closeFiles()
if flag != -1 : emo['fileptr'].write( line ) break if flag >= 0: emo['cnt'] = emo['cnt'] + 1 dts.loop( dealLine, 'check Emoticons' ) for emo in Emotions: emo['fileptr'].close() print '=============' print 'processed Tweets:' + str( dts.processSize ) for emo in Emotions: print emo['filename'] + ':' + str( emo['cnt'] ) dts.writeL( emo['filename'] + ':' + str( emo['cnt'] ) + '\n' ) dts.closeFiles() #tfile = open( '../data/tweets_small.txt', 'r' ) # #for x in range( processTweetSize + 1 ): # line = tfile.readline() # for emo in Emotions: # flag = -2 # for eicon in emo['Icons']: # flag = line.find( eicon ) # if flag != -1 : # break # if flag >= 0: # emo['cnt'] = emo['cnt'] + 1
if ans == int(emo['label']): label = 1 Emotions[ans - 1]['cnt'] += 1 tmp = {u'text': text, u'label': label} emo['fileptr'].write(json.dumps(tmp) + '\n') else: if Emotions[ans - 1]['ncnt'] < Emotions[ans - 1]['cnt']: label = -1 Emotions[ans - 1]['ncnt'] += 1 tmp = {u'text': text, u'label': label} emo['fileptr'].write(json.dumps(tmp) + '\n') pass if __name__ == "__main__": dts.setSize(2000000) dts.setFile('../emojiOutput/afterPre.txt', '../emojiOutput/test_featre_all', '../log/test_labeled_by_emoji_log') dts.openFiles() for emo in Emotions: emo['fileptr'] = codecs.open(OutputDir + emo['name'], 'w', 'utf-8') dts.loop(__divide, 'divide and label twiiters') for emo in Emotions: print '%s\t:\t%d' % (emo['name'], emo['cnt']) dts.writeL('%s\t:\t%d\n' % (emo['name'], emo['cnt'])) emo['fileptr'].close() dts.closeFiles() pass
def __clean(param): for key, cnt in [(k, v) for k, v in topicDict.iteritems()]: if cnt < param[0]: topicDict.pop(key) if __name__ == "__main__": dts.setSize(13000000) dts.setFile('/home/server2103/dump/twitter.tweet.json', '../emojiOutput/topics', '../log/topics.emoji') dts.openFiles() dts.loop_with_param_clean(__dealLine, __clean, [ 3, ], 'find hashtags') cnt = 0 sum = 0 print 'start output' for key, value in topicDict.iteritems(): dts.writeO('%s\t:%d\n' % (key, value)) cnt += 1 sum += value dts.writeL('%d hashtags with %d displays' % (cnt, sum)) print '%d hashtags with %d displays' % (cnt, sum) dts.closeFiles() pass
""" find emoji in tweets """ import io import os import re import codecs import dealTweets as dts dts.setSize(50000) dts.setFile("../data/tweet_noRT_noDup.txt", "../tmp/b.out", "../tmp/c.out") dts.openFiles() def findemoji(str): line = dts.readlineI() if str in line: print line dts.writeO(line) #dts.loop_with_param( findemoji, u'☺️', u'try to find Emoji :☺️' ) #dts.writeL( '0001F612'.decode('hex').encode('utf-8') ) dts.writeL(u'\xe2\x98\xba\xef\xb8\x8f with hay!') smile = '\xe2\x98\xba\xef\xb8\x8f'.decode('utf-8') dts.loop_with_param(findemoji, smile, u'try to find Emoji :' + smile) print '\xe2\x98\xba\xef\xb8\x8f'.decode('utf-8').encode('utf-8') dts.closeFiles()