def crossValidation(tweetList, k,maxNgram):
    """Run k-fold cross-validation of the n-gram ranking classifier.

    Each fold splits the tweets into train/test partitions, learns the
    n-gram distributions and confidence scores on the training part, and
    prints the ranking-evaluation results for that fold.
    """
    # Ranking cut-offs used by the out-of-place measure.
    m = 80
    n = 50
    for fold in xrange(k):
        trainSet, testSet = divideDataset(tweetList, k, fold)
        # NOTE(review): other call sites unpack obtainNgrams() into three
        # values — confirm which version of utils this snippet targets.
        trainDist = utils.obtainNgrams(trainSet, maxNgram)
        confidenceDict = utils.learnNgramConfidencefromData(trainDist, trainSet)
        predicted, true = utils.evaluateNgramRakingSet(testSet, trainDist,
                                                      confidenceDict, m, n)
        # utils.printJeroni(true,predicted,i)
        utils.printResults(testSet, predicted, fold)
def crossValidationLinearInterpolation(tweetList, k, maxNgram):
    for i in xrange(k):
        trainSet, testSet = divideDataset(tweetList, k, i)
        trainDist, arrayLanguages, languagesAll = utils.obtainNgrams(trainSet, maxNgram)
        linearCoefficients = linear.getlinearcoefficientsForLanguageArray(arrayLanguages, maxNgram, trainDist)
        print linearCoefficients
        count = 0
        tot = 0

        for tweet in testSet:
            predictedLanguage, probability = linear.getPredictedLanguageForTweet(linearCoefficients, tweet.text, maxNgram, trainDist)
            utils.printResultTXT(predictedLanguage, tweet)

            if(predictedLanguage == tweet.language):
                count = count + 1;
            tot = tot +1
            # print str(count)+'/'+str(tot)
        print 'correct tweets fold '+str(i)+' = '+str(count)+'/'+str(tot)
def outofplaceMeasureSet(m, n, freqDistSet, testText,ngramSize):
    """Score testText against every language frequency distribution.

    Computes the out-of-place measure of the test text's n-gram ranking
    against each training distribution in freqDistSet and returns the
    scores normalised to sum to 1.

    m, n -- maximum number of ranked n-grams taken from each training
    distribution and from the test text respectively (clamped to the
    available sizes, never mutating the caller-visible intent).
    """
    # Frequency distribution of the test text itself; it is shared by all
    # languages, so its clamp is hoisted out of the loop.
    freqDistTest = utils.getBigramFreqForSingleLang(testText, ngramSize)
    nEff = min(n, len(freqDistTest))

    probList = list()
    for freqDist in freqDistSet:
        # BUGFIX: the original reassigned `m` inside the loop, so a short
        # distribution early in freqDistSet permanently shrank the cut-off
        # for every later language, making the scores order-dependent.
        mEff = min(m, len(freqDist))
        probList.append(outofplaceMeasure(mEff, nEff, freqDist, freqDistTest,
                                          testText, ngramSize))

    # Normalise; guard against an all-zero score vector.  float() forces
    # true division — the raw measures may be integer rank sums, and Py2
    # integer division would otherwise zero the whole result.
    listSum = sum(probList)
    if listSum == 0:
        listSum = 1
    for i in xrange(0, len(probList)):
        probList[i] = probList[i] / float(listSum)
    return probList
Example #4
0
def getPredictedLanguageForTweet(linearCoefficients, text, maxNgram, corpusNgrams):
    """Return the chosen language label and the best model probability.

    Scores `text` under every language's interpolation coefficients, then
    delegates the final decision to utils.chooseLanguagesLin with a
    threshold derived from the gap between the best and the mean score.
    """
    scores = dict()
    for coefficients in linearCoefficients:
        prob, language = languageProbability(text, maxNgram, corpusNgrams, coefficients)
        scores[language] = prob

    # Track the best-scoring language (ties resolved by last seen).
    bestProbability = 0
    for language in scores:
        if scores[language] >= bestProbability:
            bestProbability = scores[language]
            bestLanguage = language

    average = np.mean(scores.values())
    # One tenth of the spread between the best and the mean probability.
    threshold = (bestProbability - average) / 10
    chosen = utils.chooseLanguagesLin(scores, threshold)
    # chosen = bestLanguage
    return chosen, bestProbability
Example #5
0
import UtilsTweetSafa as utils
import LidstonLanguageClassification as llc
import sys
import RankingModelClassification as rmc
from collections import Counter


# sentence = 'Once upon a time there was a cat who wore boots'
#sentence = 'It is known for being the first to print many English manuscripts, including Cotton Nero A.x, which contains Pearl, Sir Gawain and the Green Knight, and other poems.'
# sentence = 'O portugues foi usado, naquela epoca,'
# sentence = "una frase en espanol, es una prueba de que el programa funcione"
# sentence = 'La France metropolitaine possede une grande variete de paysages, entre des plaines agricoles ou boisees, des chaines de montagnes plus ou moins erodees, des littoraux diversifies et des vallees melant villes et espaces neo-naturels.'
# sentence = 'today i will go home with my brother and sister because i like it, the mountain is a thing in english'

# Build the combined dataset from the four per-language tweet files.
dataSet = utils.createDataSet("datasets/en_tweets.txt","datasets/es_tweets.txt","datasets/fr_tweets.txt","datasets/pt_tweets.txt")


allTexts = utils.getAllLanguagesSet(dataSet)
allTexts = utils.formatDataset(allTexts)

# The sentence to classify is taken from the command line and cleaned
# with the same pre-processing applied to the training tweets.
sentence = sys.argv[1]
sentence = utils.cleanTweets(sentence)

language = llc.lidstoneLanguageClassification(sentence, allTexts)

predictedLabel = list()
# Ranking cut-offs for the out-of-place measure.
m = 80
n = 100
for nGramSize in xrange(2,5):
Example #6
0
import UtilsTweetSafa as utils


#sentence = 'Once upon a time there was a cat who wore boots'
#sentence = 'this is a foo bar sentences and i want to ngramize it'
#sentence = 'O portugues foi usado, naquela epoca,'
#sentence = "una frase en espanol, es una mierda de programa"
#sentence = 'La France metropolitaine possede une grande variete de paysages, entre des plaines agricoles ou boisees, des chaines de montagnes plus ou moins erodees, des littoraux diversifies et des vallees melant villes et espaces neo-naturels.'

# Driver: compare the Lidstone classifier against the ranking model via
# cross-validation on the four-language tweet dataset.
dataSet = utils.createDataSet("datasets/en_tweets.txt","datasets/es_tweets.txt","datasets/fr_tweets.txt","datasets/pt_tweets.txt")

error = utils.crossValidationLidstone(dataSet)
print 'Lidstone error = ' + str(error)

cleanDataSet = utils.cleanDataset(dataSet)
# Ranking cut-offs: m n-grams from the training side, n from the test side.
m = 80
n = 100

error = utils.crossValidationRanking(m, n, cleanDataSet)


print 'Ranking model error = ' + str(error)

def lidstoneLanguageClassification(sentence, allTexts):

    nbr = 0
    ntr = 0
    freqDistSetUni = utils.returnNgramFreqSet(allTexts, 1)
    freqDistSetBi = utils.returnNgramFreqSet(allTexts, 2)
    freqDistSetTri = utils.returnNgramFreqSet(allTexts, 3)

    # Language
    #   0 = English
    #   1 = Spanish
    #   2 = French
    #   3 = Portuguese

    results = []
    filas = 4
    columnas = 2
    lamda = 0.1

    for i in range(filas):
        results.append([0] * columnas)

    for language in range(0, 4):

        # Count Unigrams (CUR), bigrams (CBR) and total observations (NBR)

        for i in xrange(0, len(freqDistSetUni[language].items())):
            cur[freqDistSetUni[language].items()[i][0][0]] = freqDistSetUni[language].items()[i][1]
        for j in xrange(0, len(freqDistSetBi[language].items())):
            cbr[freqDistSetBi[language].items()[j][0][0] + freqDistSetBi[language].items()[j][0][1]] = freqDistSetBi[
                language
            ].items()[j][1]
            nbr = nbr + freqDistSetBi[language].items()[j][1]

        ## Compute input probability Bigram

        prob = 1.0
        for i in range(0, len(sentence) - 1):
            x = sentence[i]
            y = sentence[i + 1]
            pt = plidstone(x + y, cur, cbr, nbr, lamda, 1)
            prob = prob * pt
            # sys.stdout.write((x+y).encode("utf-8")+" "+str(pt)+" "+str(prob)+"\n")
        # sys.stdout.write(str(language)+"language Sequence probability: "+str(prob)+"\n")

        results[language][0] = float(prob)

        # Count Bigrams (CBR), trigrams (CTR) and the total observations (NTR)

        for i in xrange(0, len(freqDistSetBi[language].items())):
            cbr[freqDistSetBi[language].items()[i][0][0] + freqDistSetBi[language].items()[i][0][1]] = freqDistSetBi[
                language
            ].items()[i][1]
        for j in xrange(0, len(freqDistSetTri[language].items())):
            ctr[
                freqDistSetTri[language].items()[j][0][0]
                + freqDistSetTri[language].items()[j][0][1]
                + freqDistSetTri[language].items()[j][0][2]
            ] = freqDistSetTri[language].items()[j][1]
            ntr = ntr + freqDistSetTri[language].items()[j][1]

        ## Compute input probability Trigram

        prob = 1.0
        for i in range(0, len(sentence) - 2):
            x = sentence[i]
            y = sentence[i + 1]
            z = sentence[i + 2]
            pt = plidstone(x + y + z, cbr, ctr, ntr, lamda, 0)
            prob = prob * pt
            # sys.stdout.write((x+y+z).encode("utf-8")+" "+str(pt)+" "+str(prob)+"\n")
        #  sys.stdout.write(str(language)+"language Sequence probability: "+str(prob)+"\n")
        results[language][1] = float(prob)

    max = results[0][0]
    maxi = 0
    maxj = 0
    for i in range(0, filas):
        for j in range(0, columnas):
            if results[i][j] > max:
                max = results[i][j]
                maxi = i
                maxj = j
            if j == 0:
                sys.stdout.write(str(i) + "language Sequence probability bigrams: " + str(results[i][j]) + "\n")
            else:
                sys.stdout.write(str(i) + "language Sequence probability trigrams: " + str(results[i][j]) + "\n")

    print results[maxi][maxj]
    print maxi
    return maxi
# Clean the raw train/test tweet lists with the shared pre-processor.
tweetListPreProcessed_train = preprocess.main(tweetList_train)
tweetListPreProcessed_test = preprocess.main(tweetList_test)
# shuffle(tweetListPreProcessed)

# 3-. Algorithms

# 3.1-. Algorithms: Bayesian Networks
#   3.2.1-. Linear interpolation
#       Generate linear coefficients: input (n-grams and language)
#       Smooth data

# cv.crossValidationLinearInterpolation(tweetListPreProcessed_train, 3, maxNgram)
linearCoefficientsAll = list()

# Learn one set of interpolation coefficients per n-gram order
# (1..maxNgram), all from the same training distributions.
trainDist, arrayLanguages, languagesAll = utils.obtainNgrams(tweetListPreProcessed_train, maxNgram)
for gram in xrange(1, maxNgram+1):
    linearCoefficientsAll.append(linear.getlinearcoefficientsForLanguageArray(arrayLanguages, gram, trainDist))

print linearCoefficientsAll

# linearCoefficientsALL = read.readLinearCoefficients(LI_Coefficients)


count = 4 # which gram order to start from

for i in xrange(count, maxNgram):
    count = count + 1
    t0 = time.time()

    for tweet in tweetListPreProcessed_test:
Example #9
0
# Load and pre-process the test split, then shuffle the training tweets.
tweetListtest = read.read_tweets_dataset(test)
# 2-. Pre-process state

tweetListPreProcessed = preprocess.main(tweetList)
tweetListPreProcessedtest= preprocess.main(tweetListtest)
shuffle(tweetListPreProcessed)
    # Raw data -> tweetList
    # Clean data -> tweetListPreProcessed

#utils.printTweets(tweetListPreProcessed)

# 3-. Algorithms
#
# 3.1-. OBTAIN N-GRAMS

# Distributions up to (maxNgram+1)-grams over the pre-processed tweets.
corpusNgrams, arrayLanguages, arrayLanguagesFull = utils.obtainNgrams(tweetListPreProcessed, maxNgram+1)
arrayLanguagesFull = utils.orderVector(arrayLanguagesFull)

# Example:  print(corpusNgrams.get(str(3)).get('pt'))


# 3.2-. Algorithms: Bayesian Networks
#   3.2.1-. Linear interpolation
#       Generate linear coefficients: input (n-grams and language)
#       Smooth data


# Sample tweets in several languages, usable as quick smoke-test inputs.
tweetEN = "Tomorrow is going to be a good day to go to the beach."
tweetPT = "Amanhã será um dia muito bom, como ir para a praia."
tweetCA = "Demà farà un dia molt bo, com per anar a la platja."
tweetEU = "Bihar egun oso ona egingo du, hondartzara joateko modukoa."
# _____________________________________________________________________________


# 1-. Read dataset and create tweetList fullfilled of Tweet object*

# Dataset path and maximum n-gram order come from the command line.
dataset = sys.argv[1]
maxNgram = int(sys.argv[2])

# Base name (without extension) used later to name the output file.
filename = os.path.basename(dataset).split('.')

tweetList = read.read_tweets_dataset(dataset)

# 2-. Pre-process state
    # Raw data -> tweetList
    # Clean data -> tweetListPreProcessed
tweetListPreProcessed = preprocess.main(tweetList)

# For each n-gram order, learn the linear-interpolation coefficients and
# append them (one tab-separated line per coefficient vector) to a file
# named after the dataset.
# BUGFIX: the original opened a new handle on every iteration but closed
# only the last one after the loop (leaking the rest), shadowed the
# builtin `file`, and raised a NameError on close when maxNgram < 5 left
# the loop body unexecuted.  A `with` block per iteration fixes all three
# while preserving the append semantics.
for i in xrange(5, maxNgram+1):
    corpusNgrams, arrayLanguages, arrayLanguagesFull = utils.obtainNgrams(tweetListPreProcessed, i+1)
    linearCoefficients = linear.getlinearcoefficientsForLanguageArray(arrayLanguages, i, corpusNgrams)
    # print linearCoefficients
    outPath = '../Dataset/LICoefficients_'+str(maxNgram)+'gram_for-'+str(filename[0])+'.txt'
    with open(outPath, 'a+') as outFile:
        for li in linearCoefficients:
            outFile.write(str(i)+"\t"+str(li[0]))
            for co in xrange(1, i+1):
                outFile.write("\t"+str(li[co]))
            outFile.write("\n")
Example #11
0
# Load and pre-process the test split, then shuffle the training tweets.
tweetListtest = read.read_tweets_dataset(test)
# 2-. Pre-process state

tweetListPreProcessed = preprocess.main(tweetList)
tweetListPreProcessedtest = preprocess.main(tweetListtest)
shuffle(tweetListPreProcessed)
# Raw data -> tweetList
# Clean data -> tweetListPreProcessed

#utils.printTweets(tweetListPreProcessed)

# 3-. Algorithms
#
# 3.1-. OBTAIN N-GRAMS

# Distributions up to (maxNgram+1)-grams over the pre-processed tweets.
corpusNgrams, arrayLanguages, arrayLanguagesFull = utils.obtainNgrams(
    tweetListPreProcessed, maxNgram + 1)
arrayLanguagesFull = utils.orderVector(arrayLanguagesFull)

# Example:  print(corpusNgrams.get(str(3)).get('pt'))

# 3.2-. Algorithms: Bayesian Networks
#   3.2.1-. Linear interpolation
#       Generate linear coefficients: input (n-grams and language)
#       Smooth data

# Sample tweets in several languages, usable as quick smoke-test inputs.
tweetEN = "Tomorrow is going to be a good day to go to the beach."
tweetPT = "Amanhã será um dia muito bom, como ir para a praia."
tweetCA = "Demà farà un dia molt bo, com per anar a la platja."
tweetEU = "Bihar egun oso ona egingo du, hondartzara joateko modukoa."
tweetGL = "Mañá será un día moi bo, como ir á praia."
tweetES = "Mañana hará un dia muy bueno, como para ir a la playa."