def topicFilter():
    dts.setSize(14000000)
    dts.setFile("/home/server2103/dump/twitter.tweet.json",
                "../entityOutput/topictwitter", "../log/matchtwitter")
    dts.openFiles()
    dts.loop(filterHashtags, 'filterHashtags')
    dts.closeFiles()
def __testfind():
    dts.setSize(3830000)
    dts.setFile('../emojiOutput/afterPre.txt', '../emojiOutput/EmoAll.txt',
                '../log/divideEmoticons')
    dts.openFiles()
    dts.loop(__testEmo, 'test emoji')
    dts.closeFiles()
def featureVectorParse():
    dts.setSize(10000)
    dts.setFile('../data/featvect', '../emojiOutput/featureWang10000_no01',
                '../log/featureWang')
    dts.openFiles()
    dts.loop(__lineParse, 'parse featvect')

    dts.writeL(str(name_dict))

    dts.closeFiles()
def divideHashtag():
    dts.setSize(1000000)
    dts.setFile('../hashOutput/afterPre.txt',
                '../hashOutput/divideHashtag.txt', '../log/divideHashtag.log')
    dts.openFiles()

    dts.loop(__divide, 'divide by Hashtag')
    for emo in EmoList:
        print 'label %d \t: %d' % (emo['label'], emo['cnt'])
        dts.writeL('label %d \t: %d\n' % (emo['label'], emo['cnt']))

    dts.closeFiles()
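All of these snippets drive the same dealTweets helper module (imported as dts), which is not included in this excerpt. Below is a minimal sketch of its interface, inferred purely from the calls above; every name and behavior here is an assumption, not the real implementation.

# dealTweets.py -- hypothetical sketch, reconstructed from usage only
import codecs

processSize = 0                 # lines processed so far (read by callers)
ifileName = ofileName = lfileName = ''
_size = 0
_ifile = _ofile = _lfile = None

def setSize(n):
    # upper bound on the number of lines to process
    global _size
    _size = n

def setFile(ifname, ofname, lfname=''):
    global ifileName, ofileName, lfileName
    ifileName, ofileName, lfileName = ifname, ofname, lfname

def openFiles():
    global _ifile, _ofile, _lfile
    _ifile = codecs.open(ifileName, 'r', 'utf-8')
    if ofileName:
        _ofile = codecs.open(ofileName, 'w', 'utf-8')
    if lfileName:
        _lfile = codecs.open(lfileName, 'w', 'utf-8')

def readlineI():
    return _ifile.readline()    # returns u'' at end of file

def writeO(text):
    _ofile.write(text)

def writeL(text):
    _lfile.write(text)

def loop(func, tag):
    # call func() once per input line, up to the configured size,
    # reporting progress under the given tag
    global processSize
    for processSize in xrange(_size):
        if processSize % 100000 == 0:
            print '%s: %d / %d' % (tag, processSize, _size)
        func()

def closeFiles():
    for f in (_ifile, _ofile, _lfile):
        if f is not None:
            f.close()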
Example #5
def __cleanDup():
    dts.openFiles()
    tw = set()

    def __push():
        text = dts.readlineI()
        tw.add(text)

    dts.loop(__push, 'push into set')
    print 'start writing to file %s' % dts.ofileName
    cnt = 0
    for text in tw:
        dts.writeO(text)
        cnt += 1
    print 'write finished, total tweets left: %d' % cnt

    dts.closeFiles()
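__cleanDup keeps every distinct tweet text in memory, which works up to a few million lines. A hedged variant (names hypothetical, not from the original) stores md5 digests instead of full strings, trading a tiny collision risk for a much smaller working set, and can write each new tweet immediately:

import hashlib

seen = set()

def __pushHashed():
    # store a 16-byte digest per distinct tweet instead of the full text
    text = dts.readlineI()
    digest = hashlib.md5(text.encode('utf-8')).digest()
    if digest not in seen:
        seen.add(digest)
        dts.writeO(text)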
def featureVectorParse():
    topicList = [ur"#emabiggestfans1d", ur"#emabiggestfansjustinbieber",
                 ur"#p**n", ur"#ipad", ur"#halloween", ur"#emabiggestfans5sos",
                 ur"#stealmygirl", ur"#thewalkingdead", ur"#ebola",
                 ur"#emabiggestfansarianagrande", ur"#lol"]

    dfile = codecs.open( '../log/featureWang', 'r', 'utf-8' )
    line = dfile.readline()
    global name_dict
    name_dict = eval( line )
    dfile.close()

    for topic in topicList:
        ifilename = '../entityOutput/topicTwitter_' + topic[1:]
        ofilename = '../entityOutput/topicFeat_' + topic[1:]
        lfilename = '../log/featureVectorParse_entity'

        dts.setSize( 50000 )
        dts.setFile( ifilename, ofilename, lfilename )
        dts.openFiles()
        dts.loop( __lineParse, 'parse featvect:' + topic )
        dts.closeFiles()
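This loader round-trips name_dict through str() and eval(). If the dict only ever holds literals (strings and ints), ast.literal_eval is a drop-in replacement that cannot execute arbitrary code; a sketch:

import ast
import codecs

dfile = codecs.open('../log/featureWang', 'r', 'utf-8')
name_dict = ast.literal_eval(dfile.readline())
dfile.close()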
def labelCounter():
    dts.setSize(100000)
    dts.setFile('../data/featvect', '', '../log/featvectLabelCount')
    dts.openFiles()
    global counter
    for x in range(9):
        counter[x] = 0
    dts.loop(__line, 'parse featvect')

    total = 0
    for x in range(9):
        total += counter[x]

    for x in range(9):
        print 'Label\t%d\t:%d (%.2f%%)' % (
            x, counter[x], 100.0 * counter[x] / total)
        dts.writeL('Label\t%d\t:%d (%.2f%%)\n' %
                   (x, counter[x], 100.0 * counter[x] / total))

    print 'Sum\t\t:%d' % total

    dts.closeFiles()
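labelCounter depends on a module-level counter and a __line callback that are not part of this excerpt. A plausible sketch, assuming each featvect line starts with an integer class label in 0..8 (both the names and the line format are assumptions):

counter = {}

def __line():
    # hypothetical: assumes a line looks like "3 1:0.5 7:1.0 ..."
    line = dts.readlineI()
    if not line:
        return
    label = int(line.split()[0])
    counter[label] += 1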
        # NOTE: the opening of this if-branch is missing from the excerpt;
        # the first three lines below are reconstructed by symmetry with the
        # elif branch that follows, so those exact names are assumptions.
        if __type == 'UnigramHashtag_run':
            DictDir = '../emojiOutput/UnigramHashtagDict'
            loop_ofilename = '../emojiOutput/feautre_unihash_'
            loop_lfilename = '../Compare_Output/ans_unihash_'
            all_ofilename = '../emojiOutput/feautre_unihash_all'
            all_lfilename = '../Compare_Output/ans_unihash_all'
        elif __type == 'UnigramEmoticon_run':
            DictDir = '../emojiOutput/UnigramEmoticonDict'
            loop_ofilename = '../emojiOutput/feautre_uniemo_'
            loop_lfilename = '../Compare_Output/ans_uniemo_'
            all_ofilename = '../emojiOutput/feautre_uniemo_all'
            all_lfilename = '../Compare_Output/ans_uniemo_all'
        load_Index()

        for Emo in divideByEmoji.Emotions:
            ifilename = divideByEmoji.OutputDir + Emo['name']
            ofilename = loop_ofilename + Emo['name']
            lfilename = loop_lfilename + Emo['name']
            dts.setSize(100000)
            dts.setFile(ifilename, ofilename, lfilename)
            dts.openFiles()
            PC = 0
            dts.loop(parse_line, 'generating ' + Emo['name'])
            dts.closeFiles()

        ifilename = '../emojiOutput/featre_all'
        dts.setSize(100000)
        dts.setFile(ifilename, all_ofilename, all_lfilename)
        dts.openFiles()
        dts.loop(parse_line, 'generating all')
        dts.closeFiles()

    pass
"""

import re
import sys
import codecs
import HTMLParser
import preprocess_func
import dealTweets as dts
import utilities
#from nltk.stem.wordnet import WordNetLemmatizer

dts.setSize(25770000)
dts.setFile('../data/tweet_noRT_noDup.txt', '../output/afterPre.txt',
            '../log/pre.log')
dts.openFiles()

tokenizer = utilities.Tokenizer()


def __preprocess():
    line = preprocess_func.preprocess(dts.readlineI())
    dts.writeO(line)


#    terms = [term for term in tokenizer.tokenize(line)]
#    print terms

dts.loop(__preprocess, 'preprocess symbols')

dts.closeFiles()
Example #10
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# vim:fenc=utf-8
#
# Copyright © 2014 Carwest Sung <*****@*****.**>
#
# Distributed under terms of the MIT license.
"""

"""

import dealTweets as dts
import json


def __io():
    line = dts.readlineI()
    if not line:
        return
    obj = json.loads(line)
    text = obj['text']
    dts.writeO(text + '\n')


dts.setSize(300)
dts.setFile('../emojiOutput/afterPre.txt', '../emojiOutput/checkAfterPre.txt')
dts.openFiles()
dts.loop(__io, 'io')
dts.closeFiles()
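json.loads raises ValueError on a truncated or malformed line, which would abort the whole pass. A slightly more defensive variant of __io (hypothetical, not from the original) skips such lines:

def __ioSafe():
    line = dts.readlineI()
    if not line:
        return
    try:
        obj = json.loads(line)
    except ValueError:
        return                  # skip lines that are not valid JSON
    dts.writeO(obj.get('text', '') + '\n')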
Example #11
    def dealLine():
        line = dts.readlineI()
        for emo in Emotions:
            if emo['cnt'] > MaxEmotionSize:
                continue
            # flag stays -2 if this emotion has no icons, becomes the match
            # position (>= 0) when an icon is found, and -1 when none match
            flag = -2
            for eicon in emo['Icons']:
                flag = line.find( eicon )
                if flag != -1:
                    emo['fileptr'].write( line )
                    break
            if flag >= 0:
                emo['cnt'] = emo['cnt'] + 1

    dts.loop( dealLine, 'check Emoticons' )

    for emo in Emotions:
        emo['fileptr'].close()

    print '============='
    print 'processed Tweets:' + str( dts.processSize )
    for emo in Emotions:
        print emo['filename'] + ':' + str( emo['cnt'] )
        dts.writeL( emo['filename'] + ':' + str( emo['cnt'] ) + '\n' )

    dts.closeFiles()

#tfile = open( '../data/tweets_small.txt', 'r' )
#
#for x in range( processTweetSize + 1 ):
            if ans == int(emo['label']):
                label = 1
                Emotions[ans - 1]['cnt'] += 1
                tmp = {u'text': text, u'label': label}
                emo['fileptr'].write(json.dumps(tmp) + '\n')
            else:
                if Emotions[ans - 1]['ncnt'] < Emotions[ans - 1]['cnt']:
                    label = -1
                    Emotions[ans - 1]['ncnt'] += 1
                    tmp = {u'text': text, u'label': label}
                    emo['fileptr'].write(json.dumps(tmp) + '\n')

    pass


if __name__ == "__main__":
    dts.setSize(2000000)
    dts.setFile('../emojiOutput/afterPre.txt',
                '../emojiOutput/test_featre_all',
                '../log/test_labeled_by_emoji_log')
    dts.openFiles()
    for emo in Emotions:
        emo['fileptr'] = codecs.open(OutputDir + emo['name'], 'w', 'utf-8')
    dts.loop(__divide, 'divide and label tweets')
    for emo in Emotions:
        print '%s\t:\t%d' % (emo['name'], emo['cnt'])
        dts.writeL('%s\t:\t%d\n' % (emo['name'], emo['cnt']))
        emo['fileptr'].close()
    dts.closeFiles()
    pass
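The Emotions table that these scripts share is defined elsewhere (in divideByEmoji). Judging from the fields read and written above, each entry is a dict shaped roughly like this; the concrete values are invented for illustration:

Emotions = [
    {
        'name': 'joy',             # suffix for per-class file names
        'label': 1,                # integer class id
        'Icons': [u':)', u':-)'],  # emoticons that mark this class
        'cnt': 0,                  # positive examples written so far
        'ncnt': 0,                 # negative examples written so far
        'filename': '',            # set when the output file is opened
        'fileptr': None,           # codecs handle, opened at runtime
    },
    # ... one dict per emotion class
]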
Example #13
def filterEmoticons():
    dts.setSize( 310000 )
    dts.setFile( '../data/tweet_noRT_noDup.txt', '../tmp/filter.out', '../log/filterEmoticons.log' )
    dts.openFiles()
    dts.loop( __cleanTweet, 'clean Tweets' )
    dts.closeFiles()
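The __cleanTweet callback is not included in this excerpt. A rough sketch of what it might do, assuming (purely as a guess from the function's name and log tag) that the filter drops tweets containing any known emoticon:

def __cleanTweet():
    line = dts.readlineI()
    for emo in Emotions:
        for eicon in emo['Icons']:
            if line.find(eicon) != -1:
                return          # drop tweets containing an emoticon
    dts.writeO(line)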