def preAll():
    """Run the full preprocessing pass over the raw tweet dump."""
    dts.setSize(7000000)
    dts.setFile('../data/twitter.tweets.json',
                '../emojiOutput/afterPre.txt',
                '../log/EmojiPre.log')
    dts.openFiles()
    __preProcess()
    dts.closeFiles()
def __testfind():
    """Scan the preprocessed tweets and run the emoji detection test pass."""
    dts.setSize(3830000)
    dts.setFile('../emojiOutput/afterPre.txt',
                '../emojiOutput/EmoAll.txt',
                '../log/divideEmoticons')
    dts.openFiles()
    dts.loop(__testEmo, 'test emoji')
    dts.closeFiles()
def bigram():
    """Build the raw bigram dictionary from the preprocessed corpus."""
    dts.setFile('../output/afterPre.txt',
                '../output/BiDict.txt',
                '../log/bigram.txt')
    dts.setSize(25000000)
    dts.openFiles()
    make_bigram()
    dts.closeFiles()
def select_bigram():
    """Keep only bigrams whose count lies inside the [100, 100000] window."""
    dts.setFile('../output/BiDict.txt',
                '../output/select_bigram.txt',
                '../log/select_bigram')
    dts.setSize(389920)
    dts.openFiles()
    dts.loop_with_param(__filter_bigram, [100, 100000], 'filter_bigram')
    dts.closeFiles()
def topicFilter():
    """Filter the full tweet dump down to tweets matching tracked hashtags."""
    dts.setSize(14000000)
    dts.setFile("/home/server2103/dump/twitter.tweet.json",
                "../entityOutput/topictwitter",
                "../log/matchtwitter")
    dts.openFiles()
    dts.loop(filterHashtags, 'filterHashtags')
    dts.closeFiles()
def select_dict():
    """Keep dictionary entries whose count lies inside the [1000, 34400] window."""
    dts.setFile('../output/Dict_raw.txt',
                '../output/Dict_select.txt',
                '../log/idf_select.log')
    dts.setSize(214884)
    dts.openFiles()
    dts.loop_with_param(__filter_range, [1000, 34400], 'filter Dict_raw')
    dts.closeFiles()
def featureUnigram(): topicList = [ur"#emabiggestfans1d", ur"#emabiggestfansjustinbieber", ur"#p**n", ur"#ipad", ur"#halloween", ur"#emabiggestfans5sos", ur"#stealmygirl", ur"#thewalkingdead", ur"#ebola", ur"#emabiggestfansarianagrande", ur"#lol"] for hashtag in topicList: topic = hashtag[1:] dts.setSize( 50000 ) dts.setFile( '../entityOutput/topictwitter', '../entityOutput/topicTwitter_'+topic, '../log/topicTwitterFeatvect') dts.openFiles() dts.loop_with_param( __dealLine, [ hashtag, ],'Generating Unigram With Tag:'+topic ) dts.closeFiles()
def featureVectorParse():
    """Parse ../data/featvect into Wang-style features and log name_dict.

    NOTE(review): a second function named featureVectorParse is defined
    later in this file and shadows this one at import time -- confirm
    which of the two is actually meant to be the live definition.
    """
    dts.setSize(10000)
    dts.setFile('../data/featvect',
                '../emojiOutput/featureWang10000_no01',
                '../log/featureWang')
    dts.openFiles()
    dts.loop(__lineParse, 'parse featvect')
    # persist the accumulated name -> index mapping into the log file
    dts.writeL(str(name_dict))
    dts.closeFiles()
def divideHashtag(): dts.setSize(1000000) dts.setFile('../hashOutput/afterPre.txt', '../hashOutput/divideHashtag.txt', '../log/divideHashtag.log') dts.openFiles() dts.loop(__divide, 'divide by Hashtag') for emo in EmoList: print 'label %d \t: %d' % (emo['label'], emo['cnt']) dts.writeL('label %d \t: %d\n' % (emo['label'], emo['cnt'])) dts.closeFiles()
def divideEmoticons(): dts.setSize(3830000) dts.setFile('../emojiOutput/EafterPre.txt', '', '../log/divideEmoticons') dts.openFiles() for emo in Emotions: emo['fileptr'] = codecs.open(outputDir + emo['filename'], 'w', 'utf-8') dts.loop_with_param(__divide, [ 3000, ], 'divide Emotions') for emo in Emotions: print '%s\t:\t%d' % (emo['filename'], emo['cnt']) emo['fileptr'].close() dts.closeFiles()
def __cleanDup(): dts.openFiles() tw = set() def __push(): text = dts.readlineI() tw.add(text) dts.loop(__push, 'push into set') print 'start write to file %s' % dts.ofileName cnt = 0 for text in tw: dts.writeO(text) cnt += 1 print 'write finished, tot tweet left: %d' % cnt dts.closeFiles()
def featureGenerator():
    """Generate labelled feature lines from each emotion bucket file.

    Fix: each per-emotion input file is now opened via a ``with`` block so
    the handle is guaranteed to close even if dts.loop_with_param raises;
    the original called ifile.close() unconditionally after the loop and
    leaked the handle on error.
    """
    dts.setSize(5000)
    dts.setFile('../emojiOutput/afterPre.txt',
                '../emojiOutput/feature5000.txt',
                '../log/emojiFeatureGenerator.log')
    dts.openFiles()
    __featureGenerator_init()
    for emo in devideEmotion.Emotions:
        filename = devideEmotion.outputDir + emo['filename']
        # context manager guarantees close() on every exit path
        with codecs.open(filename, 'r', 'utf-8') as ifile:
            dts.loop_with_param(__g_each_tweet,
                                [emo['label'], ifile],
                                emo['filename'])
    dts.closeFiles()
def featureVectorParse(): topicList = [ur"#emabiggestfans1d", ur"#emabiggestfansjustinbieber", ur"#p**n", ur"#ipad", ur"#halloween", ur"#emabiggestfans5sos", ur"#stealmygirl", ur"#thewalkingdead", ur"#ebola", ur"#emabiggestfansarianagrande", ur"#lol"] dfile = codecs.open( '../log/featureWang', 'r', 'utf-8' ) line = dfile.readline() global name_dict name_dict = eval( line ) dfile.close() for topic in topicList: ifilename = '../entityOutput/topicTwitter_' + topic[1:] ofilename = '../entityOutput/topicFeat_' + topic[1:] lfilename = '../log/featureVectorParse_entity' dts.setSize( 50000 ) dts.setFile( ifilename, ofilename, lfilename ) dts.openFiles() dts.loop( __lineParse, 'parse featvect:' + topic ) dts.closeFiles()
def labelCounter(): dts.setSize(100000) dts.setFile('../data/featvect', '', '../log/featvectLabelCount') dts.openFiles() global counter for x in range(9): counter[x] = 0 dts.loop(__line, 'parse featvect') sum = 0 for x in range(9): sum += counter[x] for x in range(9): print 'Label\t%d\t:%d (%.2f%%)' % ( x, counter[x], float(counter[x] * 100.0) / float(sum)) dts.writeL('Label\t%d\t:%d (%.2f%%)\n' % (x, counter[x], float(counter[x] * 100.0) / float(sum))) print 'Sum\t\t:%d' % sum dts.closeFiles()
def make_dict(): dts.setFile('../output/afterPre.txt', '../output/Dict_raw.txt', '../log/idf.log') dts.setSize(25770000) dts.openFiles() dict = {} dts.loop_with_param(__calcIDF, dict, 'calc the Idf') print 'start sort and print' cnt = 0 pcnt = 0 CntDistribution = {} CNT_MAX = 1000000 for x in range(CNT_MAX + 1): CntDistribution[x] = 0 for key, value in [(k, dict[k]) for k in sorted(dict.keys())]: if value > 10 and value < 364600: dts.writeO('%s:%d\n' % (key, value)) pcnt += 1 cnt += 1 if (value > 364600): print key if (value > CNT_MAX * 10): CntDistribution[CNT_MAX] += 1 else: CntDistribution[value / 10] += 1 print '%d words output' % pcnt dts.writeL('%d words output\n' % pcnt) print 'printing range log' ncnt = 0 for x in range(CNT_MAX): ncnt += CntDistribution[x] dts.writeL('%7d~%7d:\t%d\n' % (x * 10, (x + 1) * 10, cnt - ncnt)) dts.closeFiles()
def filterEmoticons():
    """Run the tweet-cleaning pass over the deduplicated, RT-free corpus."""
    dts.setSize(310000)
    dts.setFile('../data/tweet_noRT_noDup.txt',
                '../tmp/filter.out',
                '../log/filterEmoticons.log')
    dts.openFiles()
    dts.loop(__cleanTweet, 'clean Tweets')
    dts.closeFiles()
loop_lfilename = '../Compare_Output/ans_unihash_' all_ofilename = '../emojiOutput/feautre_unihash_all' all_lfilename = '../Compare_Output/ans_unihash_all' elif __type == 'UnigramEmoticon_run': DictDir = '../emojiOutput/UnigramEmoticonDict' loop_ofilename = '../emojiOutput/feautre_uniemo_' loop_lfilename = '../Compare_Output/ans_uniemo_' all_ofilename = '../emojiOutput/feautre_uniemo_all' all_lfilename = '../Compare_Output/ans_uniemo_all' load_Index() for Emo in divideByEmoji.Emotions: ifilename = divideByEmoji.OutputDir + Emo['name'] ofilename = loop_ofilename + Emo['name'] lfilename = loop_lfilename + Emo['name'] dts.setSize(100000) dts.setFile(ifilename, ofilename, lfilename) dts.openFiles() PC = 0 dts.loop(parse_line, 'generating ' + Emo['name']) dts.closeFiles() ifilename = '../emojiOutput/featre_all' dts.setSize(100000) dts.setFile(ifilename, all_ofilename, all_lfilename) dts.openFiles() dts.loop(parse_line, 'generating all') dts.closeFiles() pass