Example #1
0
 def testDetect(self):
     """Check the first element of ``LangDetect.detect`` for sample texts.

     Every sample pairs a text with the language code expected at index 0
     of the detection result; the empty string is expected to yield
     ``'other'``. All samples are compared in one assertion.
     """
     samples = [
          (u"The quick brown",'en'),
          (u"Le renard brun rapide saute par-dessus le chien paresseux",'fr'),
          (u"@Ja_Nina HERRLICH :) ich hab nix auf planeten gefunden..deine version klingt absolut logisch :D",'de'),
          (u"En Google somos plenamente conscientes de la confianza que los usuarios depositan ",'es'),
          (u"Noi di Google siamo perfettamente consapevoli della fiducia che riponi in noi e della ",'it'),
          (u'русский язык','ru'),
          (u'','other')
     ]

     detector = LangDetect(languages = supportedLangs)

     # Detected code per sample, in sample order.
     detected = [detector.detect(sample)[0] for sample, _ in samples]
     # Expected code per sample, in the same order.
     expected = [code for _, code in samples]

     assert detected == expected
Example #2
0
    def __init__(self,
                 dataDir = "~",
                 training_data_fileP1 = 'mood_training_p1.dat',
                 training_data_fileP2 = 'mood_training.dat',
                 data_p_file = 'tweets_positive_raw.dat',
                 data_n_file = 'tweets_negative_raw.dat'):
        """Create the trainers, the language detector and open the raw files.

        dataDir -- directory containing the raw tweet files. A leading
            ``~`` is expanded to the user's home directory before the
            files are opened (``open`` itself does not expand it).
        training_data_fileP1 -- passed as ``data_file`` to the first
            MoodDetectTrainer (``clsP1``).
        training_data_fileP2 -- passed as ``data_file`` to the second
            MoodDetectTrainer (``clsP2``).
        data_p_file -- file name inside dataDir, opened in binary mode as
            ``tweetsPFile``.
        data_n_file -- file name inside dataDir, opened in binary mode as
            ``tweetsNFile``.

        Raises whatever ``open`` raises when one of the files is missing.
        """
        self.dataDir = dataDir

        self.clsP1 = MoodDetectTrainer(data_file = training_data_fileP1)
        self.clsP2 = MoodDetectTrainer(data_file = training_data_fileP2)

        self.langClassifier = LangDetect(supportedLangs)

        self.training_data_p1 = MoodDetectTrainData()
        self.training_data_p2 = MoodDetectTrainData()

        # os.path.join("~", name) would otherwise point at a directory that
        # is literally called "~" relative to the working directory.
        rawDir = os.path.expanduser(self.dataDir)

        self.tweetsPFile = open(os.path.join(rawDir, data_p_file), 'rb')
        try:
            self.tweetsNFile = open(os.path.join(rawDir, data_n_file), 'rb')
        except Exception:
            # Do not leak the first handle when the second file is missing.
            self.tweetsPFile.close()
            raise

        # NOTE(review): self.limit is not created here; presumably it is a
        # dict defined on the class, in which case these writes are shared
        # by every instance -- confirm against the class body.
        self.limit['en'] = 150000
        self.limit['default'] = 1000
 def loadCls():
     """Attach the classifiers to ``ThreadedTCPServer`` as class attributes.

     ``langCls`` receives a LangDetect built from ``supportedLangs`` and
     ``moodCls`` a MoodDetect wrapping a fresh MoodDetectTrainer.
     """
     languageDetector = LangDetect(supportedLangs)
     ThreadedTCPServer.langCls = languageDetector

     moodTrainer = MoodDetectTrainer()
     moodDetector = MoodDetect(moodTrainer)
     ThreadedTCPServer.moodCls = moodDetector
Example #4
0
class RawClassifier(object):
    """Build mood training sets from pickled raw tweets in two passes.

    Pass one (``classifyP1``) reads the negative and the positive tweet
    file, adds every accepted tweet to ``training_data_p1`` and trains
    ``clsP1`` on it. Pass two (``classifyP2``) drops the features for which
    the pass-one classifier is not confident enough and trains ``clsP2`` on
    the rows that keep at least three features.
    """

    # Class-level defaults. ``statsData`` and ``limit`` are replaced by
    # per-instance dicts in __init__ so that instances do not share state.
    statsData = {}
    dataDir = "~"
    limit = {}
    # Number of successfully loaded records classifyRaw discards before it
    # starts processing (counted down across calls).
    skip = 0
    # A feature survives classifyP2 only when max(P(n), P(p)) exceeds this.
    p2_f_limit = 0.75

    def __init__(self,
                 dataDir = "~",
                 training_data_fileP1 = 'mood_training_p1.dat',
                 training_data_fileP2 = 'mood_training.dat',
                 data_p_file = 'tweets_positive_raw.dat',
                 data_n_file = 'tweets_negative_raw.dat'):
        """Create the trainers, the language detector and open the raw files.

        dataDir -- directory containing the raw tweet files. A leading
            ``~`` is expanded to the user's home directory before the
            files are opened (``open`` itself does not expand it).
        training_data_fileP1 -- passed as ``data_file`` to the first
            MoodDetectTrainer (``clsP1``).
        training_data_fileP2 -- passed as ``data_file`` to the second
            MoodDetectTrainer (``clsP2``).
        data_p_file -- file name inside dataDir, opened in binary mode as
            ``tweetsPFile``.
        data_n_file -- file name inside dataDir, opened in binary mode as
            ``tweetsNFile``.

        Raises whatever ``open`` raises when one of the files is missing.
        """
        self.dataDir = dataDir

        # Per-instance state: writing through ``self`` into the class-level
        # dicts would leak counters and limits between instances. Entries
        # already present on the class are kept as defaults.
        self.statsData = {}
        self.limit = dict(self.limit)

        self.clsP1 = MoodDetectTrainer(data_file = training_data_fileP1)
        self.clsP2 = MoodDetectTrainer(data_file = training_data_fileP2)

        self.langClassifier = LangDetect(supportedLangs)

        self.training_data_p1 = MoodDetectTrainData()
        self.training_data_p2 = MoodDetectTrainData()

        # os.path.join("~", name) would otherwise point at a directory that
        # is literally called "~" relative to the working directory.
        rawDir = os.path.expanduser(self.dataDir)

        self.tweetsPFile = open(os.path.join(rawDir, data_p_file), 'rb')
        try:
            self.tweetsNFile = open(os.path.join(rawDir, data_n_file), 'rb')
        except Exception:
            # Do not leak the first handle when the second file is missing.
            self.tweetsPFile.close()
            raise

        self.limit['en'] = 150000
        self.limit['default'] = 1000

    def close(self):
        """Close both raw tweet files opened by the constructor."""
        self.tweetsPFile.close()
        self.tweetsNFile.close()

    def classifyP1(self,stripSmiles=False):
        """Run pass one: collect rows from both tweet files and train clsP1.

        stripSmiles -- when true, emoticons are removed from each tweet's
            text before the text is added to the training data.

        Prints a completion message followed by the per-language counters.
        """
        self.classifyRaw(self.tweetsNFile,'n',stripSmiles)
        self.classifyRaw(self.tweetsPFile,'p',stripSmiles)
        self.clsP1.train(self.training_data_p1)
        print("done training P1")

        print(self.statsData)

    def classifyP2(self):
        """Run pass two: remove noisy n-grams and train clsP2.

        Every feature of every pass-one row is classified on its own
        (together with the row's ``x_lang``) by the pass-one classifier. A
        feature is dropped when neither label reaches a probability above
        ``p2_f_limit``. Rows left with fewer than three features are
        discarded. ``training_data_p1`` and ``clsP1`` are deleted before
        ``clsP2`` is trained.
        """
        # tf: features tested, df: features deleted.
        _st = {'tf':0,'df':0}

        for features,label in self.training_data_p1:

            # NOTE: pop() also removes 'x_lang' from the pass-one row itself;
            # harmless here because the pass-one set is deleted below.
            lang = features.pop('x_lang')
            keptFeatures = features.copy()

            for name,value in features.items():
                prob = self.clsP1.classifier.prob_classify({name:value,'x_lang':lang})

                _st['tf'] += 1

                if max(prob.prob('n'),prob.prob('p')) <= self.p2_f_limit:
                    del keptFeatures[name]
                    _st['df'] += 1

            if len(keptFeatures) >= 3:
                keptFeatures['x_lang'] = lang
                self.training_data_p2.append((keptFeatures,label))

        print("%d %d" % (len(self.training_data_p2), len(self.training_data_p1)))
        print(_st)

        print("deleting p1 set")
        del self.training_data_p1
        del self.clsP1
        print("Done deleting p1 set")
        self.clsP2.train(self.training_data_p2)

    def stripSmiles(self,text):
        """Return text with every listed emoticon removed.

        Matching is case-sensitive and the emoticons are removed one after
        another in list order.
        """
        emos = [':)',':-)',';-)',': )',':d','=)',':p',';)','<3',':(',':-(',': (']

        for item in emos:
            text = text.replace(item,"")
        return text

    def stats(self,lang,mood):
        """Count one tweet for (lang, mood) unless the limit is reached.

        lang -- language code; its limit is ``self.limit[lang]`` when
            present, otherwise ``self.limit['default']``.
        mood -- 'n' or 'p'.

        Returns 1 after incrementing the counter, or 0 (counter untouched)
        when the counter has already reached the limit.
        """
        counts = self.statsData.setdefault(lang, {'n':0,'p':0})

        if lang in self.limit:
            limit = self.limit[lang]
        else:
            limit = self.limit['default']

        if counts[mood] >= limit:
            return 0

        counts[mood] += 1
        return 1

    def checkDoubleEmo(self,mood,text):
        """Return True when text contains the emoticon of the opposite mood.

        For mood 'n' the text is searched for ':)', for mood 'p' for ':('.
        Any other mood yields False.
        """
        if mood == 'n':
            return ':)' in text

        if mood == 'p':
            return ':(' in text

        return False

    def classifyRaw(self,file,mood,stripSmiles):
        """Read pickled tweets from file and add accepted ones to pass one.

        file -- open binary file holding consecutive pickled records; each
            record is used through ``.get('text')``.
        mood -- label ('n' or 'p') stored with every accepted tweet.
        stripSmiles -- when true, emoticons are removed from the text
            before it is stored (language detection still sees them).

        A tweet is rejected when it is empty, contains 'rt ', contains the
        emoticon of the opposite mood, or when the per-language limit for
        this mood is reached. Reading stops at end of file.
        """
        while True:
            try:
                # NOTE: unpickling executes arbitrary code; only feed this
                # files that were produced by trusted code.
                tweet = cPickle.load(file)
            except EOFError:
                print("done for %s" % mood)
                break
            except Exception:
                # Skip the unreadable record. Falling through would either
                # hit an unbound ``tweet`` (first record) or silently
                # process the previous tweet a second time.
                continue

            if self.skip > 0:
                self.skip -= 1
                continue

            if not tweet:
                continue

            text = unicode(tweet.get('text'))

            # NOTE(review): presumably meant to drop retweets, but this also
            # matches any word ending in "rt" followed by a space -- confirm.
            if text.lower().find('rt ') != -1:
                continue

            if self.checkDoubleEmo(mood,text):
                continue

            lang = self.langClassifier.detect(text)

            if stripSmiles:
                text = self.stripSmiles(text)

            # stats() returns 0 once the limit for (language, mood) is hit.
            if self.stats(lang[0], mood) == 0:
                continue

            self.training_data_p1.addRow(text, mood, lang[0])

    def countRows(self,file):
        """Print file, then the number of readable and unreadable records.

        file -- open binary file holding consecutive pickled records.

        Reading stops at end of file; any other error while loading a
        record increments the "breakes" count.
        """
        rows = 0
        breakes = 0
        while True:
            try:
                cPickle.load(file)
                rows += 1
            except EOFError:
                break
            except Exception:
                breakes += 1

        print(file)
        print("%d %d" % (rows, breakes))
Example #5
0
import sys
sys.path.append('../../')
import socket
import os
from tracker.lib.moodClassifierClient import MoodClassifierTCPClient
from tracker.lib.lang_detection import LangDetect
from tracker.lib.supportedLangs import supportedLangs
import cPickle
import linecache

# Alternative (remote) classifier endpoint, kept for reference:
#MCC = MoodClassifierTCPClient('srv1.cyhex.com',6666)
# Client for the mood classifier service. The two arguments are presumably
# the host and port of a locally running server -- TODO confirm.
MCC = MoodClassifierTCPClient('127.0.0.1', 6666)

# Six counters, all starting at 1.
# NOTE(review): the meaning of the keys is not visible in this part of the
# file; verify against the code that updates them.
cls_data = {'nc': 1, 'pc': 1, 'n': 1, 'p': 1, 'n#': 1, 'p#': 1}

# Language detector built from the imported supportedLangs.
langClassifier = LangDetect(supportedLangs)

# Hard-coded absolute paths to the positive / negative test tweet files;
# they have to be adjusted on any other machine.
tweetsPFile = "/home/gx/Sites/SMM/trunk/tracker/data/tweets_positive_test.dat"
tweetsNFile = "/home/gx/Sites/SMM/trunk/tracker/data/tweets_negative_test.dat"


def stripSmiles(text):
    """Return text with every known emoticon removed.

    The emoticons are removed one after another in the order listed, so a
    removal can expose a later emoticon. Matching is case-sensitive.
    """
    emoticons = (
        ':)', ':-)', ';-)', ': )', ':d', '=)', ':p', ';)', '<3', ':(', ':-(',
        ': (',
    )

    cleaned = text
    for emoticon in emoticons:
        if emoticon in cleaned:
            cleaned = cleaned.replace(emoticon, "")
    return cleaned