def __testfind():
    """Run the emoji-detection pass (__testEmo) over the preprocessed tweets."""
    dts.setSize(3830000)
    dts.setFile('../emojiOutput/afterPre.txt',
                '../emojiOutput/EmoAll.txt',
                '../log/divideEmoticons')
    dts.openFiles()
    dts.loop(__testEmo, 'test emoji')
    dts.closeFiles()
def bigram():
    """Build the raw bigram dictionary from the preprocessed corpus."""
    dts.setFile('../output/afterPre.txt',
                '../output/BiDict.txt',
                '../log/bigram.txt')
    dts.setSize(25000000)
    dts.openFiles()
    make_bigram()
    dts.closeFiles()
def preAll():
    """Preprocess the raw twitter JSON dump into the emoji-pipeline input file."""
    dts.setSize(7000000)
    dts.setFile('../data/twitter.tweets.json',
                '../emojiOutput/afterPre.txt',
                '../log/EmojiPre.log')
    dts.openFiles()
    __preProcess()
    dts.closeFiles()
def select_bigram():
    """Filter the bigram dictionary through __filter_bigram."""
    dts.setFile('../output/BiDict.txt',
                '../output/select_bigram.txt',
                '../log/select_bigram')
    dts.setSize(389920)
    dts.openFiles()
    # [100, 100000] presumably are min/max count bounds -- confirm in __filter_bigram.
    dts.loop_with_param(__filter_bigram, [100, 100000], 'filter_bigram')
    dts.closeFiles()
def topicFilter():
    """Scan the raw tweet dump and keep tweets that match tracked hashtags."""
    dts.setSize(14000000)
    dts.setFile("/home/server2103/dump/twitter.tweet.json",
                "../entityOutput/topictwitter",
                "../log/matchtwitter")
    dts.openFiles()
    dts.loop(filterHashtags, 'filterHashtags')
    dts.closeFiles()
def select_dict():
    """Filter the raw dictionary down to terms accepted by __filter_range."""
    dts.setFile('../output/Dict_raw.txt',
                '../output/Dict_select.txt',
                '../log/idf_select.log')
    dts.setSize(214884)
    dts.openFiles()
    # [1000, 34400] presumably are min/max frequency bounds -- see __filter_range.
    dts.loop_with_param(__filter_range, [1000, 34400], 'filter Dict_raw')
    dts.closeFiles()
def featureVectorParse():
    """Parse the feature-vector file and log the resulting name dictionary."""
    dts.setSize(10000)
    dts.setFile('../data/featvect',
                '../emojiOutput/featureWang10000_no01',
                '../log/featureWang')
    dts.openFiles()
    dts.loop(__lineParse, 'parse featvect')
    # Persist the module-level name_dict that __lineParse accumulated.
    dts.writeL(str(name_dict))
    dts.closeFiles()
def featureUnigram(): topicList = [ur"#emabiggestfans1d", ur"#emabiggestfansjustinbieber", ur"#p**n", ur"#ipad", ur"#halloween", ur"#emabiggestfans5sos", ur"#stealmygirl", ur"#thewalkingdead", ur"#ebola", ur"#emabiggestfansarianagrande", ur"#lol"] for hashtag in topicList: topic = hashtag[1:] dts.setSize( 50000 ) dts.setFile( '../entityOutput/topictwitter', '../entityOutput/topicTwitter_'+topic, '../log/topicTwitterFeatvect') dts.openFiles() dts.loop_with_param( __dealLine, [ hashtag, ],'Generating Unigram With Tag:'+topic ) dts.closeFiles()
def divideHashtag(): dts.setSize(1000000) dts.setFile('../hashOutput/afterPre.txt', '../hashOutput/divideHashtag.txt', '../log/divideHashtag.log') dts.openFiles() dts.loop(__divide, 'divide by Hashtag') for emo in EmoList: print 'label %d \t: %d' % (emo['label'], emo['cnt']) dts.writeL('label %d \t: %d\n' % (emo['label'], emo['cnt'])) dts.closeFiles()
def divideEmoticons(): dts.setSize(3830000) dts.setFile('../emojiOutput/EafterPre.txt', '', '../log/divideEmoticons') dts.openFiles() for emo in Emotions: emo['fileptr'] = codecs.open(outputDir + emo['filename'], 'w', 'utf-8') dts.loop_with_param(__divide, [ 3000, ], 'divide Emotions') for emo in Emotions: print '%s\t:\t%d' % (emo['filename'], emo['cnt']) emo['fileptr'].close() dts.closeFiles()
def featureGenerator():
    """Generate labeled features by re-reading each emotion's divided file."""
    dts.setSize(5000)
    dts.setFile('../emojiOutput/afterPre.txt',
                '../emojiOutput/feature5000.txt',
                '../log/emojiFeatureGenerator.log')
    dts.openFiles()
    __featureGenerator_init()
    for emo in devideEmotion.Emotions:
        source = devideEmotion.outputDir + emo['filename']
        ifile = codecs.open(source, 'r', 'utf-8')
        # Each pass feeds __g_each_tweet the bucket's label plus its own file.
        dts.loop_with_param(__g_each_tweet, [emo['label'], ifile],
                            emo['filename'])
        ifile.close()
    dts.closeFiles()
def featureVectorParse(): topicList = [ur"#emabiggestfans1d", ur"#emabiggestfansjustinbieber", ur"#p**n", ur"#ipad", ur"#halloween", ur"#emabiggestfans5sos", ur"#stealmygirl", ur"#thewalkingdead", ur"#ebola", ur"#emabiggestfansarianagrande", ur"#lol"] dfile = codecs.open( '../log/featureWang', 'r', 'utf-8' ) line = dfile.readline() global name_dict name_dict = eval( line ) dfile.close() for topic in topicList: ifilename = '../entityOutput/topicTwitter_' + topic[1:] ofilename = '../entityOutput/topicFeat_' + topic[1:] lfilename = '../log/featureVectorParse_entity' dts.setSize( 50000 ) dts.setFile( ifilename, ofilename, lfilename ) dts.openFiles() dts.loop( __lineParse, 'parse featvect:' + topic ) dts.closeFiles()
def labelCounter(): dts.setSize(100000) dts.setFile('../data/featvect', '', '../log/featvectLabelCount') dts.openFiles() global counter for x in range(9): counter[x] = 0 dts.loop(__line, 'parse featvect') sum = 0 for x in range(9): sum += counter[x] for x in range(9): print 'Label\t%d\t:%d (%.2f%%)' % ( x, counter[x], float(counter[x] * 100.0) / float(sum)) dts.writeL('Label\t%d\t:%d (%.2f%%)\n' % (x, counter[x], float(counter[x] * 100.0) / float(sum))) print 'Sum\t\t:%d' % sum dts.closeFiles()
def make_dict():
    """Count term frequencies over the corpus, write the filtered dictionary,
    and log a bucketed distribution of the counts.

    NOTE(review): this block was reconstructed from a collapsed one-line
    source; the exact nesting of `pcnt += 1` / `cnt += 1` is a best guess --
    verify against the original file.
    """
    dts.setFile('../output/afterPre.txt', '../output/Dict_raw.txt', '../log/idf.log')
    dts.setSize(25770000)
    dts.openFiles()
    dict = {}  # NOTE(review): shadows the builtin `dict`
    dts.loop_with_param(__calcIDF, dict, 'calc the Idf')
    print 'start sort and print'
    cnt = 0
    pcnt = 0
    CntDistribution = {}
    CNT_MAX = 1000000
    for x in range(CNT_MAX + 1):
        CntDistribution[x] = 0
    # Iterate terms in sorted order; only counts in (10, 364600) are output.
    for key, value in [(k, dict[k]) for k in sorted(dict.keys())]:
        if value > 10 and value < 364600:
            dts.writeO('%s:%d\n' % (key, value))
            pcnt += 1
        cnt += 1
        if (value > 364600):
            print key
        # Bucket counts in steps of 10; everything above CNT_MAX*10 is clamped
        # into the last bucket.
        if (value > CNT_MAX * 10):
            CntDistribution[CNT_MAX] += 1
        else:
            CntDistribution[value / 10] += 1
    print '%d words output' % pcnt
    dts.writeL('%d words output\n' % pcnt)
    print 'printing range log'
    ncnt = 0
    # Cumulative log: cnt - ncnt = number of terms above the current bucket.
    for x in range(CNT_MAX):
        ncnt += CntDistribution[x]
        dts.writeL('%7d~%7d:\t%d\n' % (x * 10, (x + 1) * 10, cnt - ncnt))
    dts.closeFiles()
# Emoticon groups for the remaining mood buckets (angerIcons, depreIcons,
# fatigIcons and vigorIcons are defined earlier, outside this chunk).
frindIcons = ['x-<', ':-)', '(-:', ':)', '(:', u'☺️']
tensnIcons = [':-0', ':-o', ':-()', ':-O', ':O', 'o_O', 'O_o', 'O_O', 'o_o']
confuIcons = ['?_?', '@_@', '<@_@>']

angerEmo = {'filename': 'angerEmo.txt', 'Icons': angerIcons, 'cnt': 0}
depreEmo = {'filename': 'depreEmo.txt', 'Icons': depreIcons, 'cnt': 0}
fatigEmo = {'filename': 'fatigEmo.txt', 'Icons': fatigIcons, 'cnt': 0}
vigorEmo = {'filename': 'vigorEmo.txt', 'Icons': vigorIcons, 'cnt': 0}
frindEmo = {'filename': 'frindEmo.txt', 'Icons': frindIcons, 'cnt': 0}
tensnEmo = {'filename': 'tensnEmo.txt', 'Icons': tensnIcons, 'cnt': 0}
confuEmo = {'filename': 'confuEmo.txt', 'Icons': confuIcons, 'cnt': 0}
#Emotions = [angerEmo, depreEmo, fatigEmo, vigorEmo, frindEmo, tensnEmo, confuEmo];
# NOTE(review): the Emotions list above is commented out, yet dealLine below
# iterates `Emotions` -- confirm it is defined elsewhere or this will raise
# NameError at runtime.

dts.setSize( 5000000 )
dts.setFile( '../data/tweet_noRT_noDup.txt' )
dts.openFiles()

def dealLine():
    # Read one tweet; bump the counter of every emotion whose icons occur in it.
    line = dts.readlineI()
    for emo in Emotions:
        flag = -2
        for eicon in emo['Icons']:
            if eicon in line:
                print line
                #flag = line.find( eicon )
                flag = 0
                break
        # flag >= 0 means at least one of this emotion's icons matched.
        if flag >= 0:
            emo['cnt'] = emo['cnt'] + 1

dts.loop( dealLine, 'check Emoticons' )
# vim:fenc=utf-8
#
# Copyright © 2014 Carwest Sung <*****@*****.**>
#
# Distributed under terms of the MIT license.

"""
find emoji in tweets
"""

import io
import os
import re
import codecs
import dealTweets as dts

dts.setSize(5000000)
dts.setFile("../data/tweet_noRT_noDup.txt", "../tmp/b.out", "../tmp/c.out")
dts.openFiles()


def findemoji(target):
    """Echo every input line containing `target` to stdout and the output file."""
    line = dts.readlineI()
    if target in line:
        print(line)
        dts.writeO(line)

# U+1F600 GRINNING FACE, decoded from its raw UTF-8 bytes.
dts.loop_with_param(findemoji, b'\xf0\x9f\x98\x80'.decode('utf-8'),
                    'try to find Emoji :😀')
#dts.writeL( u'\xe2\x98\xba\xef\xb8\x8f with hay!' )
#smile = '\xe2\x98\xba\xef\xb8\x8f'.decode('utf-8')
# NOTE(review): the loop below is the tail of a tweet-handling function
# (presumably __dealLine) whose `def` line lies outside this chunk; the
# indentation is reconstructed and must be re-checked against the full file.
    for words in matchs:
        words = words.lower()
        # Count each lower-cased hashtag occurrence.
        topicDict.update({words: topicDict.get(words, 0) + 1})
        #print words


def __clean(param):
    # Periodic pruning hook: drop hashtags seen fewer than param[0] times.
    # Iterates a snapshot list so popping does not invalidate the iterator.
    for key, cnt in [(k, v) for k, v in topicDict.iteritems()]:
        if cnt < param[0]:
            topicDict.pop(key)


if __name__ == "__main__":
    dts.setSize(13000000)
    dts.setFile('/home/server2103/dump/twitter.tweet.json',
                '../emojiOutput/topics', '../log/topics.emoji')
    dts.openFiles()
    dts.loop_with_param_clean(__dealLine, __clean, [ 3, ], 'find hashtags')
    cnt = 0
    sum = 0
    print 'start output'
    # Dump the surviving hashtags with their display counts.
    for key, value in topicDict.iteritems():
        dts.writeO('%s\t:%d\n' % (key, value))
        cnt += 1
        sum += value
    dts.writeL('%d hashtags with %d displays' % (cnt, sum))
    print '%d hashtags with %d displays' % (cnt, sum)
# NOTE(review): len(sys.argv) is never 0 -- argv[0] is always the script name,
# so the 'no argv given' branch is unreachable; the check was likely meant to
# be `== 1`. Indentation of this collapsed block is reconstructed -- verify.
if len( sys.argv ) == 0:
    print 'no argv given'
    pass
elif len(sys.argv) != 4:
    print 'error argvs'
else:
    # Usage: <script> <ProcessSize> <MaxEmotionSize> <outputDir>
    ProcessSize = int(sys.argv[1])
    MaxEmotionSize = int(sys.argv[2])
    outputDir=sys.argv[3]
    print 'ProcessSize set to %d, MaxEmotionSize set to %d' % ( ProcessSize, MaxEmotionSize )
    print 'outputDir = %s' % outputDir
    dts.setSize( ProcessSize )
    dts.setFile( '../data/tweet_noRT_noDup.txt', '', '../log/dividedByEmoticons_'+str(ProcessSize) + '.log' )
    dts.openFiles()
    # One UTF-8 output file per emotion bucket.
    for emo in Emotions:
        emo['fileptr'] = codecs.open( outputDir + emo['filename'], 'w', 'utf-8' )

def dealLine():
    # Append the tweet to every emotion file whose icon it contains.
    line = dts.readlineI()
    for emo in Emotions:
        # NOTE(review): emo['cnt'] is compared against MaxEmotionSize but never
        # incremented anywhere in this visible chunk -- the block may be
        # truncated, or the cap is effectively dead.
        if emo['cnt'] > MaxEmotionSize:
            continue
        flag = -2
        for eicon in emo['Icons']:
            flag = line.find( eicon )
            # NOTE(review): as reconstructed, a line matching several icons is
            # written once per match -- confirm the original nesting.
            if flag != -1 :
                emo['fileptr'].write( line )
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# vim:fenc=utf-8
#
# Copyright © 2014 Carwest Sung <*****@*****.**>
#
# Distributed under terms of the MIT license.

"""
Dump the `text` field of each JSON tweet for a quick sanity check.
"""

import dealTweets as dts
import json


def __io():
    """Copy one tweet's text from the JSON input to the output file."""
    raw = dts.readlineI()
    if not raw:
        return
    record = json.loads(raw)
    dts.writeO(record['text'] + '\n')

dts.setSize(300)
dts.setFile('../emojiOutput/afterPre.txt', '../emojiOutput/checkAfterPre.txt')
dts.openFiles()
dts.loop(__io, 'io')
dts.closeFiles()
def filterEmoticons():
    """Run __cleanTweet over the tweet file to strip emoticons."""
    dts.setSize(310000)
    dts.setFile('../data/tweet_noRT_noDup.txt',
                '../tmp/filter.out',
                '../log/filterEmoticons.log')
    dts.openFiles()
    dts.loop(__cleanTweet, 'clean Tweets')
    dts.closeFiles()
# NOTE(review): this chunk starts mid-function -- the assignments below belong
# to an `if`/`elif` chain on __type whose opening branch lies outside this
# view; indentation is reconstructed and must be checked against the full file.
# NOTE(review): 'feautre'/'featre' look like typos but are live file paths --
# leave them as-is unless the on-disk files are renamed too.
        loop_lfilename = '../Compare_Output/ans_unihash_'
        all_ofilename = '../emojiOutput/feautre_unihash_all'
        all_lfilename = '../Compare_Output/ans_unihash_all'
    elif __type == 'UnigramEmoticon_run':
        DictDir = '../emojiOutput/UnigramEmoticonDict'
        loop_ofilename = '../emojiOutput/feautre_uniemo_'
        loop_lfilename = '../Compare_Output/ans_uniemo_'
        all_ofilename = '../emojiOutput/feautre_uniemo_all'
        all_lfilename = '../Compare_Output/ans_uniemo_all'
    load_Index()
    # First: one feature file per emotion bucket.
    for Emo in divideByEmoji.Emotions:
        ifilename = divideByEmoji.OutputDir + Emo['name']
        ofilename = loop_ofilename + Emo['name']
        lfilename = loop_lfilename + Emo['name']
        dts.setSize(100000)
        dts.setFile(ifilename, ofilename, lfilename)
        dts.openFiles()
        PC = 0
        dts.loop(parse_line, 'generating ' + Emo['name'])
        dts.closeFiles()
    # Then: a single combined feature file over all tweets.
    ifilename = '../emojiOutput/featre_all'
    dts.setSize(100000)
    dts.setFile(ifilename, all_ofilename, all_lfilename)
    dts.openFiles()
    dts.loop(parse_line, 'generating all')
    dts.closeFiles()
    pass
Preprocess for tweet file

Take out RT, URL addresses, punctuation, and parameters.
"""
import re
import sys
import codecs
import HTMLParser
import preprocess_func
import dealTweets as dts
import utilities
#from nltk.stem.wordnet import WordNetLemmatizer

dts.setSize(25770000)
dts.setFile('../data/tweet_noRT_noDup.txt', '../output/afterPre.txt', '../log/pre.log')
dts.openFiles()

tokenizer = utilities.Tokenizer()

def __preprocess():
    # Read one raw tweet, normalize it via preprocess_func, write it out.
    line = preprocess_func.preprocess(dts.readlineI())
    dts.writeO(line)
    # terms = [term for term in tokenizer.tokenize(line)]
    # print terms

dts.loop(__preprocess, 'preprocess symbols')
# NOTE(review): this chunk starts mid-function (presumably inside __divide's
# per-emotion loop); indentation is reconstructed -- verify against the file.
        if ans == int(emo['label']):
            # Positive example for this emotion.
            label = 1
            Emotions[ans - 1]['cnt'] += 1
            tmp = {u'text': text, u'label': label}
            emo['fileptr'].write(json.dumps(tmp) + '\n')
        else:
            # Negative example, capped so negatives (ncnt) never exceed
            # the positives collected so far (cnt).
            if Emotions[ans - 1]['ncnt'] < Emotions[ans - 1]['cnt']:
                label = -1
                Emotions[ans - 1]['ncnt'] += 1
                tmp = {u'text': text, u'label': label}
                emo['fileptr'].write(json.dumps(tmp) + '\n')
    pass


if __name__ == "__main__":
    dts.setSize(2000000)
    dts.setFile('../emojiOutput/afterPre.txt',
                '../emojiOutput/test_featre_all',
                '../log/test_labeled_by_emoji_log')
    dts.openFiles()
    # One UTF-8 output file per emotion bucket.
    for emo in Emotions:
        emo['fileptr'] = codecs.open(OutputDir + emo['name'], 'w', 'utf-8')
    dts.loop(__divide, 'divide and label twiiters')
    for emo in Emotions:
        print '%s\t:\t%d' % (emo['name'], emo['cnt'])
        dts.writeL('%s\t:\t%d\n' % (emo['name'], emo['cnt']))
        emo['fileptr'].close()
    dts.closeFiles()
    pass
#! /usr/bin/env python # -*- coding: utf-8 -*- # vim:fenc=utf-8 # # Copyright © 2014 Carwest Sung <*****@*****.**> # # Distributed under terms of the MIT license. """ clean duplicate tweet , with simple set method """ import dealTweets as dts dts.setSize(5000000) dts.setFile('../data/tweet_noRT.txt', '../tmp/noDup.txt', '../log/checkNoDup.log') def __cleanDup(): dts.openFiles() tw = set() def __push(): text = dts.readlineI() tw.add(text) dts.loop(__push, 'push into set') print 'start write to file %s' % dts.ofileName cnt = 0 for text in tw: dts.writeO(text)