def crossValidation(tweetList, k, maxNgram, m=80, n=50):
    """Run k-fold cross-validation of the n-gram ranking classifier.

    For each fold: split the tweets into train/test, build the train
    n-gram distributions, learn per-n-gram confidences, rank the test
    set and print per-fold results.

    :param tweetList: full list of Tweet objects to cross-validate on
    :param k: number of folds
    :param maxNgram: highest n-gram order used by the model
    :param m: ranking-profile cutoff for the training distributions
              (was hard-coded to 80; kept as the default)
    :param n: ranking-profile cutoff for the test text
              (was hard-coded to 50; kept as the default)
    """
    for i in xrange(k):
        trainSet, testSet = divideDataset(tweetList, k, i)
        trainDist = utils.obtainNgrams(trainSet, maxNgram)
        confidenceDict = utils.learnNgramConfidencefromData(trainDist, trainSet)
        predicted, true = utils.evaluateNgramRakingSet(testSet, trainDist,
                                                      confidenceDict, m, n)
        utils.printResults(testSet, predicted, i)
def crossValidationLinearInterpolation(tweetList, k, maxNgram): for i in xrange(k): trainSet, testSet = divideDataset(tweetList, k, i) trainDist, arrayLanguages, languagesAll = utils.obtainNgrams(trainSet, maxNgram) linearCoefficients = linear.getlinearcoefficientsForLanguageArray(arrayLanguages, maxNgram, trainDist) print linearCoefficients count = 0 tot = 0 for tweet in testSet: predictedLanguage, probability = linear.getPredictedLanguageForTweet(linearCoefficients, tweet.text, maxNgram, trainDist) utils.printResultTXT(predictedLanguage, tweet) if(predictedLanguage == tweet.language): count = count + 1; tot = tot +1 # print str(count)+'/'+str(tot) print 'correct tweets fold '+str(i)+' = '+str(count)+'/'+str(tot)
def outofplaceMeasureSet(m, n, freqDistSet, testText, ngramSize):
    """Score *testText* against every language's frequency profile.

    Computes the out-of-place measure of the test text versus each
    language distribution in *freqDistSet* and returns the scores
    normalized to sum to 1 (a pseudo-probability per language).

    :param m: max profile length taken from each language distribution
    :param n: max profile length taken from the test-text distribution
    :param freqDistSet: iterable of per-language frequency distributions
    :param testText: raw text to classify
    :param ngramSize: n-gram order used to build the test profile
    :return: list of normalized scores, aligned with freqDistSet
    """
    probList = list()
    freqDistTest = utils.getBigramFreqForSingleLang(testText, ngramSize)
    # Invariant cap for the test profile — hoisted out of the loop.
    nTest = min(n, len(freqDistTest))
    for freqDist in freqDistSet:
        # BUG FIX: the original wrote `m = min(m, len(freqDist))` inside the
        # loop, so a short distribution for one language permanently shrank
        # the cap used for every later language. Compute it per language.
        mLang = min(m, len(freqDist))
        probList.append(outofplaceMeasure(mLang, nTest, freqDist,
                                          freqDistTest, testText, ngramSize))
    listSum = sum(probList)
    if listSum == 0:
        listSum = 1  # avoid division by zero when every score is 0
    for i in xrange(0, len(probList)):
        probList[i] = probList[i] / listSum
    return probList
def getPredictedLanguageForTweet(linearCoefficients, text, maxNgram, corpusNgrams):
    """Score *text* under every language's interpolated n-gram model.

    Collects one probability per language, derives a threshold from the
    spread between the best score and the mean, and delegates the final
    language choice to utils.chooseLanguagesLin.

    :return: (chosen language(s), highest per-language probability)
    """
    bestProb = 0
    langProbs = dict()
    for coeff in linearCoefficients:
        prob, language = languageProbability(text, maxNgram, corpusNgrams, coeff)
        langProbs[language] = prob
        if prob >= bestProb:
            bestProb = prob
    # Threshold: a tenth of the gap between the winner and the mean score.
    meanProb = np.mean(langProbs.values())
    threshold = (bestProb - meanProb) / 10
    chosen = utils.chooseLanguagesLin(langProbs, threshold)
    return chosen, bestProb
import UtilsTweetSafa as utils import LidstonLanguageClassification as llc import sys import RankingModelClassification as rmc from collections import Counter # sentence = 'Once upon a time there was a cat who wore boots' #sentence = 'It is known for being the first to print many English manuscripts, including Cotton Nero A.x, which contains Pearl, Sir Gawain and the Green Knight, and other poems.' # sentence = 'O portugues foi usado, naquela epoca,' # sentence = "una frase en espanol, es una prueba de que el programa funcione" # sentence = 'La France metropolitaine possede une grande variete de paysages, entre des plaines agricoles ou boisees, des chaines de montagnes plus ou moins erodees, des littoraux diversifies et des vallees melant villes et espaces neo-naturels.' # sentence = 'today i will go home with my brother and sister because i like it, the mountain is a thing in english' dataSet = utils.createDataSet("datasets/en_tweets.txt","datasets/es_tweets.txt","datasets/fr_tweets.txt","datasets/pt_tweets.txt") allTexts = utils.getAllLanguagesSet(dataSet) allTexts = utils.formatDataset(allTexts) sentence = sys.argv[1] sentence = utils.cleanTweets(sentence) language = llc.lidstoneLanguageClassification(sentence, allTexts) predictedLabel = list() m = 80 n = 100 for nGramSize in xrange(2,5):
import UtilsTweetSafa as utils #sentence = 'Once upon a time there was a cat who wore boots' #sentence = 'this is a foo bar sentences and i want to ngramize it' #sentence = 'O portugues foi usado, naquela epoca,' #sentence = "una frase en espanol, es una mierda de programa" #sentence = 'La France metropolitaine possede une grande variete de paysages, entre des plaines agricoles ou boisees, des chaines de montagnes plus ou moins erodees, des littoraux diversifies et des vallees melant villes et espaces neo-naturels.' dataSet = utils.createDataSet("datasets/en_tweets.txt","datasets/es_tweets.txt","datasets/fr_tweets.txt","datasets/pt_tweets.txt") error = utils.crossValidationLidstone(dataSet) print 'Lidstone error = ' + str(error) cleanDataSet = utils.cleanDataset(dataSet) m = 80 n = 100 error = utils.crossValidationRanking(m, n, cleanDataSet) print 'Ranking model error = ' + str(error)
def lidstoneLanguageClassification(sentence, allTexts): nbr = 0 ntr = 0 freqDistSetUni = utils.returnNgramFreqSet(allTexts, 1) freqDistSetBi = utils.returnNgramFreqSet(allTexts, 2) freqDistSetTri = utils.returnNgramFreqSet(allTexts, 3) # Language # 0 = English # 1 = Spanish # 2 = French # 3 = Portuguese results = [] filas = 4 columnas = 2 lamda = 0.1 for i in range(filas): results.append([0] * columnas) for language in range(0, 4): # Count Unigrams (CUR), bigrams (CBR) and total observations (NBR) for i in xrange(0, len(freqDistSetUni[language].items())): cur[freqDistSetUni[language].items()[i][0][0]] = freqDistSetUni[language].items()[i][1] for j in xrange(0, len(freqDistSetBi[language].items())): cbr[freqDistSetBi[language].items()[j][0][0] + freqDistSetBi[language].items()[j][0][1]] = freqDistSetBi[ language ].items()[j][1] nbr = nbr + freqDistSetBi[language].items()[j][1] ## Compute input probability Bigram prob = 1.0 for i in range(0, len(sentence) - 1): x = sentence[i] y = sentence[i + 1] pt = plidstone(x + y, cur, cbr, nbr, lamda, 1) prob = prob * pt # sys.stdout.write((x+y).encode("utf-8")+" "+str(pt)+" "+str(prob)+"\n") # sys.stdout.write(str(language)+"language Sequence probability: "+str(prob)+"\n") results[language][0] = float(prob) # Count Bigrams (CBR), trigrams (CTR) and the total observations (NTR) for i in xrange(0, len(freqDistSetBi[language].items())): cbr[freqDistSetBi[language].items()[i][0][0] + freqDistSetBi[language].items()[i][0][1]] = freqDistSetBi[ language ].items()[i][1] for j in xrange(0, len(freqDistSetTri[language].items())): ctr[ freqDistSetTri[language].items()[j][0][0] + freqDistSetTri[language].items()[j][0][1] + freqDistSetTri[language].items()[j][0][2] ] = freqDistSetTri[language].items()[j][1] ntr = ntr + freqDistSetTri[language].items()[j][1] ## Compute input probability Trigram prob = 1.0 for i in range(0, len(sentence) - 2): x = sentence[i] y = sentence[i + 1] z = sentence[i + 2] pt = plidstone(x + y + z, cbr, ctr, ntr, 
lamda, 0) prob = prob * pt # sys.stdout.write((x+y+z).encode("utf-8")+" "+str(pt)+" "+str(prob)+"\n") # sys.stdout.write(str(language)+"language Sequence probability: "+str(prob)+"\n") results[language][1] = float(prob) max = results[0][0] maxi = 0 maxj = 0 for i in range(0, filas): for j in range(0, columnas): if results[i][j] > max: max = results[i][j] maxi = i maxj = j if j == 0: sys.stdout.write(str(i) + "language Sequence probability bigrams: " + str(results[i][j]) + "\n") else: sys.stdout.write(str(i) + "language Sequence probability trigrams: " + str(results[i][j]) + "\n") print results[maxi][maxj] print maxi return maxi
tweetListPreProcessed_train = preprocess.main(tweetList_train) tweetListPreProcessed_test = preprocess.main(tweetList_test) # shuffle(tweetListPreProcessed) # 3-. Algorithms # 3.1-. Algorithms: Bayesian Networks # 3.2.1-. Linear interpolation # Generate linear coefficients: input (n-grams and language) # Smooth data # cv.crossValidationLinearInterpolation(tweetListPreProcessed_train, 3, maxNgram) linearCoefficientsAll = list() trainDist, arrayLanguages, languagesAll = utils.obtainNgrams(tweetListPreProcessed_train, maxNgram) for gram in xrange(1, maxNgram+1): linearCoefficientsAll.append(linear.getlinearcoefficientsForLanguageArray(arrayLanguages, gram, trainDist)) print linearCoefficientsAll # linearCoefficientsALL = read.readLinearCoefficients(LI_Coefficients) count = 4 # Desde que gram empezar for i in xrange(count, maxNgram): count = count + 1 t0 = time.time() for tweet in tweetListPreProcessed_test:
tweetListtest = read.read_tweets_dataset(test) # 2-. Pre-process state tweetListPreProcessed = preprocess.main(tweetList) tweetListPreProcessedtest= preprocess.main(tweetListtest) shuffle(tweetListPreProcessed) # Raw data -> tweetList # Clean data -> tweetListPreProcessed #utils.printTweets(tweetListPreProcessed) # 3-. Algorithms # # 3.1-. OBTAIN N-GRAMS corpusNgrams, arrayLanguages, arrayLanguagesFull = utils.obtainNgrams(tweetListPreProcessed, maxNgram+1) arrayLanguagesFull = utils.orderVector(arrayLanguagesFull) # Example: print(corpusNgrams.get(str(3)).get('pt')) # 3.2-. Algorithms: Bayesian Networks # 3.2.1-. Linear interpolation # Generate linear coefficients: input (n-grams and language) # Smooth data tweetEN = "Tomorrow is going to be a good day to go to the beach." tweetPT = "Amanhã será um dia muito bom, como ir para a praia." tweetCA = "Demà farà un dia molt bo, com per anar a la platja." tweetEU = "Bihar egun oso ona egingo du, hondartzara joateko modukoa."
# _____________________________________________________________________________
# 1-. Read dataset and create tweetList fullfilled of Tweet object*
dataset = sys.argv[1]            # path to the tweet dataset file
maxNgram = int(sys.argv[2])      # highest n-gram order to generate
filename = os.path.basename(dataset).split('.')
tweetList = read.read_tweets_dataset(dataset)

# 2-. Pre-process state
# Raw data -> tweetList
# Clean data -> tweetListPreProcessed
tweetListPreProcessed = preprocess.main(tweetList)

# 3-. OBTAIN N-GRAMS and Linear Coefficients
# For each order i, train interpolation coefficients and append one row
# per language to the coefficients file: "<i>\t<lang>\t<c1>...\t<ci>".
for i in xrange(5, maxNgram+1):
    corpusNgrams, arrayLanguages, arrayLanguagesFull = utils.obtainNgrams(tweetListPreProcessed, i+1)
    linearCoefficients = linear.getlinearcoefficientsForLanguageArray(arrayLanguages, i, corpusNgrams)
    # FIX: use a context manager (exception-safe close) and do not shadow
    # the builtin `file`, which the original used as a variable name.
    outPath = '../Dataset/LICoefficients_'+str(maxNgram)+'gram_for-'+str(filename[0])+'.txt'
    with open(outPath, 'a+') as outFile:
        for li in linearCoefficients:
            outFile.write(str(i)+"\t"+str(li[0]))
            for co in xrange(1, i+1):
                outFile.write("\t"+str(li[co]))
            outFile.write("\n")
tweetListtest = read.read_tweets_dataset(test) # 2-. Pre-process state tweetListPreProcessed = preprocess.main(tweetList) tweetListPreProcessedtest = preprocess.main(tweetListtest) shuffle(tweetListPreProcessed) # Raw data -> tweetList # Clean data -> tweetListPreProcessed #utils.printTweets(tweetListPreProcessed) # 3-. Algorithms # # 3.1-. OBTAIN N-GRAMS corpusNgrams, arrayLanguages, arrayLanguagesFull = utils.obtainNgrams( tweetListPreProcessed, maxNgram + 1) arrayLanguagesFull = utils.orderVector(arrayLanguagesFull) # Example: print(corpusNgrams.get(str(3)).get('pt')) # 3.2-. Algorithms: Bayesian Networks # 3.2.1-. Linear interpolation # Generate linear coefficients: input (n-grams and language) # Smooth data tweetEN = "Tomorrow is going to be a good day to go to the beach." tweetPT = "Amanhã será um dia muito bom, como ir para a praia." tweetCA = "Demà farà un dia molt bo, com per anar a la platja." tweetEU = "Bihar egun oso ona egingo du, hondartzara joateko modukoa." tweetGL = "Mañá será un día moi bo, como ir á praia." tweetES = "Mañana hará un dia muy bueno, como para ir a la playa."