Example #1
0
def select_bigram():
    """Run ``__filter_bigram`` over the bigram dictionary file.

    Configures ``dts`` with the paths ``../output/BiDict.txt``,
    ``../output/select_bigram.txt`` and ``../log/select_bigram`` and a size
    of 389920, opens the files, and drives ``__filter_bigram`` through
    ``dts.loop_with_param`` with the parameter list ``[100, 100000]``.

    NOTE(review): the three paths are presumably input, output and log, and
    the two numbers presumably the bounds applied by ``__filter_bigram`` --
    confirm against ``dealTweets`` and the filter itself.
    """
    dts.setFile('../output/BiDict.txt', '../output/select_bigram.txt',
                '../log/select_bigram')
    dts.setSize(389920)
    dts.openFiles()
    try:
        dts.loop_with_param(__filter_bigram, [100, 100000], 'filter_bigram')
    finally:
        # Release the files even if the filter raises part-way through.
        dts.closeFiles()
Example #2
0
def __preProcess():
    """Write every collected tweet as one JSON object per line and report the count.

    ``__readin`` is first driven through ``dts.loop_with_param`` with the
    module-level ``tweets`` collection as its parameter. Each element of
    ``tweets`` is then serialised as ``{"text": <tweet>}`` followed by a
    newline and handed to ``dts.writeO``. Finally the number of tweets is
    printed and passed to ``dts.writeL``.

    NOTE(review): ``tweets`` is presumably filled by ``__readin`` -- verify.
    """
    dts.loop_with_param(__readin, tweets, 'loading files')

    for tweet in tweets:
        dts.writeO(json.dumps({u'text': tweet}) + '\n')

    # The same message goes to the console and to the log.
    summary = '%d tweets remaining' % len(tweets)
    print(summary)
    dts.writeL(summary)
Example #3
0
def select_dict():
    """Run ``__filter_range`` over the raw dictionary file.

    Configures ``dts`` with the paths ``../output/Dict_raw.txt``,
    ``../output/Dict_select.txt`` and ``../log/idf_select.log`` and a size
    of 214884, opens the files, and drives ``__filter_range`` through
    ``dts.loop_with_param`` with the parameter list ``[1000, 34400]``.

    NOTE(review): the three paths are presumably input, output and log, and
    the two numbers presumably the range kept by ``__filter_range`` --
    confirm against ``dealTweets`` and the filter itself.
    """
    dts.setFile('../output/Dict_raw.txt', '../output/Dict_select.txt',
                '../log/idf_select.log')
    dts.setSize(214884)
    dts.openFiles()
    try:
        dts.loop_with_param(__filter_range, [1000, 34400], 'filter Dict_raw')
    finally:
        # Release the files even if the filter raises part-way through.
        dts.closeFiles()
def featureUnigram():
    """Run ``__dealLine`` over the topic-tweet file once per hashtag.

    For every hashtag in the fixed topic list, ``dts`` is configured with
    ``../entityOutput/topictwitter``, a per-topic path
    ``../entityOutput/topicTwitter_<topic>`` (the hashtag without its leading
    ``#``) and ``../log/topicTwitterFeatvect``; the files are opened,
    ``__dealLine`` is driven with ``[hashtag]`` as its parameter, and the
    files are closed again.

    NOTE(review): the same log path is passed for every topic; whether it is
    appended to or overwritten depends on ``dts.openFiles`` -- confirm.
    """
    # Plain ``u""`` literals: none of these contains a backslash, so they are
    # equal to the former ``ur""`` spellings while also being valid Python 3.
    topicList = [
        u"#emabiggestfans1d",
        u"#emabiggestfansjustinbieber",
        u"#p**n",
        u"#ipad",
        u"#halloween",
        u"#emabiggestfans5sos",
        u"#stealmygirl",
        u"#thewalkingdead",
        u"#ebola",
        u"#emabiggestfansarianagrande",
        u"#lol",
    ]

    for hashtag in topicList:
        topic = hashtag[1:]  # drop the leading '#'
        dts.setSize(50000)
        dts.setFile('../entityOutput/topictwitter',
                    '../entityOutput/topicTwitter_' + topic,
                    '../log/topicTwitterFeatvect')
        dts.openFiles()
        try:
            dts.loop_with_param(__dealLine, [hashtag],
                                'Generating Unigram With Tag:' + topic)
        finally:
            # Close this topic's files before the next iteration reopens
            # them, even when __dealLine fails.
            dts.closeFiles()
Example #5
0
def divideEmoticons():
    """Run ``__divide`` over ``EafterPre.txt`` with one open file per emoticon.

    ``dts`` is configured with ``../emojiOutput/EafterPre.txt``, an empty
    second path and ``../log/divideEmoticons``. For every entry of the
    module-level ``Emotions`` list a UTF-8 file ``outputDir + filename`` is
    opened for writing and stored under the entry's ``'fileptr'`` key. After
    ``__divide`` has been driven with the parameter list ``[3000]``, each
    entry's ``filename`` and ``cnt`` are printed.

    All files opened here are closed on exit, including when the pass fails.

    NOTE(review): ``emo['cnt']`` and the writes to ``emo['fileptr']`` are
    presumably maintained by ``__divide``; the meaning of 3000 is not visible
    here -- confirm against ``__divide``.
    """
    dts.setSize(3830000)
    dts.setFile('../emojiOutput/EafterPre.txt', '', '../log/divideEmoticons')
    dts.openFiles()

    opened = []  # only the handles actually opened by this call
    try:
        for emo in Emotions:
            emo['fileptr'] = codecs.open(outputDir + emo['filename'], 'w', 'utf-8')
            opened.append(emo['fileptr'])

        dts.loop_with_param(__divide, [3000], 'divide Emotions')

        for emo in Emotions:
            print('%s\t:\t%d' % (emo['filename'], emo['cnt']))
    finally:
        for handle in opened:
            handle.close()
        dts.closeFiles()
def featureGenerator():
    """Run ``__g_each_tweet`` over every per-emoticon file.

    ``dts`` is configured with ``../emojiOutput/afterPre.txt``,
    ``../emojiOutput/feature5000.txt`` and
    ``../log/emojiFeatureGenerator.log`` and a size of 5000. After
    ``__featureGenerator_init()``, each entry of ``devideEmotion.Emotions``
    has its file (``devideEmotion.outputDir + filename``) opened as UTF-8 for
    reading, and ``__g_each_tweet`` is driven with ``[label, file]`` as its
    parameter and the filename as the progress description.

    Each per-emoticon file and the ``dts`` files are closed even if a pass
    raises.
    """
    dts.setSize(5000)
    dts.setFile('../emojiOutput/afterPre.txt',
                '../emojiOutput/feature5000.txt',
                '../log/emojiFeatureGenerator.log')
    dts.openFiles()
    try:
        __featureGenerator_init()

        for emo in devideEmotion.Emotions:
            filename = devideEmotion.outputDir + emo['filename']
            # The stream returned by codecs.open is a context manager, so the
            # file is closed as soon as this emoticon's pass ends or fails.
            with codecs.open(filename, 'r', 'utf-8') as ifile:
                dts.loop_with_param(__g_each_tweet, [emo['label'], ifile],
                                    emo['filename'])
    finally:
        dts.closeFiles()
Example #7
0
def make_dict():
    """Build the raw word dictionary and log a distribution of the counts.

    ``__calcIDF`` is driven through ``dts.loop_with_param`` with an initially
    empty mapping as its parameter. The mapping's keys are then visited in
    sorted order:

    * entries whose value is strictly between 10 and 364600 are written via
      ``dts.writeO`` as ``key:value`` lines;
    * keys whose value exceeds 364600 are printed;
    * every value is tallied into a histogram with buckets of width 10; values
      above ``CNT_MAX * 10`` share the last bucket.

    Afterwards the number of written words is printed and logged, and for each
    bucket ``x`` a log line ``lo~hi: n`` is written, where ``n`` is the number
    of keys whose value is at least ``(x + 1) * 10``.

    NOTE(review): assumes ``__calcIDF`` stores non-negative numeric counts in
    the mapping -- confirm against ``__calcIDF``.
    """
    dts.setFile('../output/afterPre.txt', '../output/Dict_raw.txt',
                '../log/idf.log')
    dts.setSize(25770000)
    dts.openFiles()
    try:
        counts = {}  # filled in place by __calcIDF (was named ``dict``)
        dts.loop_with_param(__calcIDF, counts, 'calc the Idf')

        print('start sort and print')
        CNT_MAX = 1000000
        # distribution[i] = number of keys with value // 10 == i; the final
        # slot also absorbs everything above CNT_MAX * 10.
        distribution = [0] * (CNT_MAX + 1)
        total = 0
        written = 0
        for key in sorted(counts):
            value = counts[key]
            if 10 < value < 364600:
                dts.writeO('%s:%d\n' % (key, value))
                written += 1
            total += 1
            if value > 364600:
                print(key)
            if value > CNT_MAX * 10:
                distribution[CNT_MAX] += 1
            else:
                # Floor division keeps the bucket index an integer regardless
                # of whether ``/`` means true division in this module.
                distribution[int(value // 10)] += 1

        print('%d words output' % written)
        dts.writeL('%d words output\n' % written)

        print('printing range log')
        seen = 0
        for x in range(CNT_MAX):
            seen += distribution[x]
            # total - seen = keys that fall in a bucket above x.
            dts.writeL('%7d~%7d:\t%d\n' % (x * 10, (x + 1) * 10, total - seen))
    finally:
        # Release the files even if any step above raises.
        dts.closeFiles()
Example #8
0
"""
find emoji in tweets
"""
import io
import os
import re
import codecs
import dealTweets as dts

def findemoji(str):
    """Read the next input line and echo it when it contains *str*.

    The line comes from ``dts.readlineI()``; a matching line is printed and
    passed to ``dts.writeO``. The parameter name shadows the builtin ``str``
    and is kept only so existing callers are unaffected.
    """
    line = dts.readlineI()
    if str in line:
        print(line)
        dts.writeO(line)


dts.setSize(5000000)
dts.setFile("../data/tweet_noRT_noDup.txt", "../tmp/b.out", "../tmp/c.out")
dts.openFiles()
try:
    # The bytes are the UTF-8 encoding of U+1F600 (grinning face).
    dts.loop_with_param(findemoji, b'\xf0\x9f\x98\x80'.decode('utf-8'),
                        'try to find Emoji :😀')
finally:
    # Close the files even if the scan is interrupted by an error.
    dts.closeFiles()
Example #9
0
"""
find emoji in tweets
"""
import io
import os
import re
import codecs
import dealTweets as dts

def findemoji(str):
    """Read the next input line and echo it when it contains *str*.

    The line comes from ``dts.readlineI()``; a matching line is printed and
    passed to ``dts.writeO``. The parameter name shadows the builtin ``str``
    and is kept only so existing callers are unaffected.
    """
    line = dts.readlineI()
    if str in line:
        print(line)
        dts.writeO(line)


# UTF-8 bytes of U+263A U+FE0F (smiling face + variation selector), decoded
# once so the log line, the search and the console output share one value.
smile = b'\xe2\x98\xba\xef\xb8\x8f'.decode('utf-8')

dts.setSize(50000)
dts.setFile("../data/tweet_noRT_noDup.txt", "../tmp/b.out", "../tmp/c.out")
dts.openFiles()
try:
    # The previous literal u'\xe2\x98\xba...' spelled the UTF-8 *bytes* as
    # individual code points, which logs mojibake instead of the emoji.
    dts.writeL(smile + u' with hay!')

    dts.loop_with_param(findemoji, smile, u'try to find Emoji :' + smile)
    # Encode explicitly so the output does not depend on the console codec.
    print(smile.encode('utf-8'))
finally:
    # Close the files even if the scan is interrupted by an error.
    dts.closeFiles()