def topicFilter():
    """Scan the raw twitter JSON dump and keep tweets matching tracked hashtags.

    Drives the shared dealTweets pipeline: the filterHashtags callback
    (defined elsewhere) decides, per line, what reaches the output file.
    """
    dts.setSize(14000000)
    dts.setFile("/home/server2103/dump/twitter.tweet.json",
                "../entityOutput/topictwitter",
                "../log/matchtwitter")
    dts.openFiles()
    dts.loop(filterHashtags, 'filterHashtags')
    dts.closeFiles()
def __testfind():
    """Run the emoji matcher (__testEmo callback) over the preprocessed dump."""
    dts.setSize(3830000)
    dts.setFile('../emojiOutput/afterPre.txt',
                '../emojiOutput/EmoAll.txt',
                '../log/divideEmoticons')
    dts.openFiles()
    dts.loop(__testEmo, 'test emoji')
    dts.closeFiles()
def featureVectorParse():
    """Parse ../data/featvect via __lineParse, then dump the name dictionary.

    The accumulated module-level name_dict is serialised into the log file
    so later runs can reload the feature-name index.
    """
    dts.setSize(10000)
    dts.setFile('../data/featvect',
                '../emojiOutput/featureWang10000_no01',
                '../log/featureWang')
    dts.openFiles()
    dts.loop(__lineParse, 'parse featvect')
    # persist the feature-name index built up during the loop
    dts.writeL(str(name_dict))
    dts.closeFiles()
def divideHashtag(): dts.setSize(1000000) dts.setFile('../hashOutput/afterPre.txt', '../hashOutput/divideHashtag.txt', '../log/divideHashtag.log') dts.openFiles() dts.loop(__divide, 'divide by Hashtag') for emo in EmoList: print 'label %d \t: %d' % (emo['label'], emo['cnt']) dts.writeL('label %d \t: %d\n' % (emo['label'], emo['cnt'])) dts.closeFiles()
def __cleanDup(): dts.openFiles() tw = set() def __push(): text = dts.readlineI() tw.add(text) dts.loop(__push, 'push into set') print 'start write to file %s' % dts.ofileName cnt = 0 for text in tw: dts.writeO(text) cnt += 1 print 'write finished, tot tweet left: %d' % cnt dts.closeFiles()
def featureVectorParse(): topicList = [ur"#emabiggestfans1d", ur"#emabiggestfansjustinbieber", ur"#p**n", ur"#ipad", ur"#halloween", ur"#emabiggestfans5sos", ur"#stealmygirl", ur"#thewalkingdead", ur"#ebola", ur"#emabiggestfansarianagrande", ur"#lol"] dfile = codecs.open( '../log/featureWang', 'r', 'utf-8' ) line = dfile.readline() global name_dict name_dict = eval( line ) dfile.close() for topic in topicList: ifilename = '../entityOutput/topicTwitter_' + topic[1:] ofilename = '../entityOutput/topicFeat_' + topic[1:] lfilename = '../log/featureVectorParse_entity' dts.setSize( 50000 ) dts.setFile( ifilename, ofilename, lfilename ) dts.openFiles() dts.loop( __lineParse, 'parse featvect:' + topic ) dts.closeFiles()
def labelCounter(): dts.setSize(100000) dts.setFile('../data/featvect', '', '../log/featvectLabelCount') dts.openFiles() global counter for x in range(9): counter[x] = 0 dts.loop(__line, 'parse featvect') sum = 0 for x in range(9): sum += counter[x] for x in range(9): print 'Label\t%d\t:%d (%.2f%%)' % ( x, counter[x], float(counter[x] * 100.0) / float(sum)) dts.writeL('Label\t%d\t:%d (%.2f%%)\n' % (x, counter[x], float(counter[x] * 100.0) / float(sum))) print 'Sum\t\t:%d' % sum dts.closeFiles()
    # NOTE(review): fragment — the enclosing function's `def` line and the
    # opening if/elif branches are outside this view.  The first three
    # assignments appear to be the tail of the preceding (presumably
    # 'UnigramHashtag_run') branch — confirm against the full file.
        loop_lfilename = '../Compare_Output/ans_unihash_'
        all_ofilename = '../emojiOutput/feautre_unihash_all'
        all_lfilename = '../Compare_Output/ans_unihash_all'
    elif __type == 'UnigramEmoticon_run':
        # dictionary directory and per-emotion / combined output-log path
        # prefixes for the unigram-emoticon feature type
        DictDir = '../emojiOutput/UnigramEmoticonDict'
        loop_ofilename = '../emojiOutput/feautre_uniemo_'
        loop_lfilename = '../Compare_Output/ans_uniemo_'
        all_ofilename = '../emojiOutput/feautre_uniemo_all'
        all_lfilename = '../Compare_Output/ans_uniemo_all'
    load_Index()
    # one dts pass per emotion bucket: read its divided tweet file and
    # emit feature lines via the parse_line callback
    for Emo in divideByEmoji.Emotions:
        ifilename = divideByEmoji.OutputDir + Emo['name']
        ofilename = loop_ofilename + Emo['name']
        lfilename = loop_lfilename + Emo['name']
        dts.setSize(100000)
        dts.setFile(ifilename, ofilename, lfilename)
        dts.openFiles()
        # PC is reset per pass — presumably a progress/position counter
        # shared with parse_line; verify against its definition
        PC = 0
        dts.loop(parse_line, 'generating ' + Emo['name'])
        dts.closeFiles()
    # final pass over the combined ("all") tweet file
    ifilename = '../emojiOutput/featre_all'
    dts.setSize(100000)
    dts.setFile(ifilename, all_ofilename, all_lfilename)
    dts.openFiles()
    dts.loop(parse_line, 'generating all')
    dts.closeFiles()
    pass
""" import re import sys import codecs import HTMLParser import preprocess_func import dealTweets as dts import utilities #from nltk.stem.wordnet import WordNetLemmatizer dts.setSize(25770000) dts.setFile('../data/tweet_noRT_noDup.txt', '../output/afterPre.txt', '../log/pre.log') dts.openFiles() tokenizer = utilities.Tokenizer() def __preprocess(): line = preprocess_func.preprocess(dts.readlineI()) dts.writeO(line) # terms = [term for term in tokenizer.tokenize(line)] # print terms dts.loop(__preprocess, 'preprocess symbols') dts.closeFiles()
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# vim:fenc=utf-8
#
# Copyright © 2014 Carwest Sung <*****@*****.**>
#
# Distributed under terms of the MIT license.

"""

"""

import dealTweets as dts
import json


def __io():
    # per-line callback: extract the tweet text from one JSON record
    raw = dts.readlineI()
    if not raw:
        return
    record = json.loads(raw)
    dts.writeO(record['text'] + '\n')


# small spot-check run over the first lines of the preprocessed dump
dts.setSize(300)
dts.setFile('../emojiOutput/afterPre.txt', '../emojiOutput/checkAfterPre.txt')
dts.openFiles()
dts.loop(__io, 'io')
dts.closeFiles()
def dealLine():
    # Per-line callback for dts.loop: write the current tweet into the file
    # of every emotion whose icon it contains, honouring a per-emotion cap.
    line = dts.readlineI()
    for emo in Emotions:
        # skip emotions that already collected more than MaxEmotionSize
        if emo['cnt'] > MaxEmotionSize:
            continue
        # flag tri-state: -2 = no icon tried, -1 = last icon tested was not
        # found, >= 0 = match position of the icon that hit (line written)
        flag = -2
        for eicon in emo['Icons']:
            flag = line.find( eicon )
            if flag != -1 :
                emo['fileptr'].write( line )
                break
        if flag >= 0:
            emo['cnt'] = emo['cnt'] + 1


# Module-level driver: classify every tweet, then close the per-emotion
# files and report the counts to stdout and the log file.
dts.loop( dealLine, 'check Emoticons' )

for emo in Emotions:
    emo['fileptr'].close()

print '============='
print 'processed Tweets:' + str( dts.processSize )
for emo in Emotions:
    print emo['filename'] + ':' + str( emo['cnt'] )
    dts.writeL( emo['filename'] + ':' + str( emo['cnt'] ) + '\n' )
dts.closeFiles()

#tfile = open( '../data/tweets_small.txt', 'r' )
#
#for x in range( processTweetSize + 1 ):
# NOTE(review): fragment — the enclosing function (presumably __divide) and
# the lines that compute `ans`, `text` and bind `emo` are outside this view.
        if ans == int(emo['label']):
            # positive example for this emotion: label +1 and bump its count
            label = 1
            Emotions[ans - 1]['cnt'] += 1
            tmp = {u'text': text, u'label': label}
            emo['fileptr'].write(json.dumps(tmp) + '\n')
        else:
            # keep the classes balanced: only emit a negative (-1) example
            # while negatives (ncnt) still trail positives (cnt)
            if Emotions[ans - 1]['ncnt'] < Emotions[ans - 1]['cnt']:
                label = -1
                Emotions[ans - 1]['ncnt'] += 1
                tmp = {u'text': text, u'label': label}
                emo['fileptr'].write(json.dumps(tmp) + '\n')
        pass


if __name__ == "__main__":
    # Driver: label the preprocessed tweets by emoji via __divide, writing
    # one output file per emotion, then report per-emotion counts.
    dts.setSize(2000000)
    dts.setFile('../emojiOutput/afterPre.txt',
                '../emojiOutput/test_featre_all',
                '../log/test_labeled_by_emoji_log')
    dts.openFiles()
    for emo in Emotions:
        emo['fileptr'] = codecs.open(OutputDir + emo['name'], 'w', 'utf-8')
    dts.loop(__divide, 'divide and label twiiters')
    for emo in Emotions:
        print '%s\t:\t%d' % (emo['name'], emo['cnt'])
        dts.writeL('%s\t:\t%d\n' % (emo['name'], emo['cnt']))
        emo['fileptr'].close()
    dts.closeFiles()
    pass
def filterEmoticons():
    """Run the emoticon cleaner (__cleanTweet callback) over the dedup'd dump."""
    dts.setSize(310000)
    dts.setFile('../data/tweet_noRT_noDup.txt',
                '../tmp/filter.out',
                '../log/filterEmoticons.log')
    dts.openFiles()
    dts.loop(__cleanTweet, 'clean Tweets')
    dts.closeFiles()