def crossValidation(tweetList, k, maxNgram):
    """Evaluate the n-gram ranking classifier over ``k`` train/test splits.

    For every fold index ``0 .. k-1`` the function:

    1. asks ``divideDataset`` for the train/test split of that fold,
    2. builds the n-gram data of the training split (``utils.obtainNgrams``),
    3. learns per-n-gram confidences from it
       (``utils.learnNgramConfidencefromData``),
    4. evaluates the test split (``utils.evaluateNgramRakingSet``),
    5. reports the predictions through ``utils.printResults``.

    Args:
        tweetList: dataset handed unchanged to ``divideDataset``.
        k: number of folds; also forwarded to ``divideDataset``.
        maxNgram: forwarded to ``utils.obtainNgrams``
            (presumably the largest n-gram order - confirm in ``utils``).

    Returns:
        None. Results are only reported via ``utils.printResults``.
    """
    # Fixed tuning values forwarded as the last two arguments of
    # utils.evaluateNgramRakingSet.
    # NOTE(review): their exact meaning is defined by that helper - confirm.
    rank_m, rank_n = 80, 50

    for fold in xrange(k):
        train_fold, test_fold = divideDataset(tweetList, k, fold)

        ngram_dist = utils.obtainNgrams(train_fold, maxNgram)
        confidences = utils.learnNgramConfidencefromData(ngram_dist, train_fold)

        evaluation = utils.evaluateNgramRakingSet(
            test_fold, ngram_dist, confidences, rank_m, rank_n)
        # The helper yields a pair; only the first element (the predictions)
        # is consumed here, the second one is intentionally unused.
        predictions, _unused_truth = evaluation

        utils.printResults(test_fold, predictions, fold)
def crossValidationLinearInterpolation(tweetList, k, maxNgram):
    """Cross-validate the linear-interpolation language identifier.

    For every fold index ``0 .. k-1`` the function splits the data with
    ``divideDataset``, builds the n-gram data of the training split, fits
    the linear coefficients, prints them, and then predicts the language
    of every tweet of the test split.  Each prediction is passed to
    ``utils.printResultTXT`` and, once the fold is finished, a summary line
    ``correct tweets fold <i> = <correct>/<total>`` is printed.

    Args:
        tweetList: dataset handed unchanged to ``divideDataset``; the items
            of the resulting test split are read through their ``text`` and
            ``language`` attributes.
        k: number of folds; also forwarded to ``divideDataset``.
        maxNgram: forwarded to ``utils.obtainNgrams`` and to both
            ``linear`` helpers.

    Returns:
        None. All results are printed.
    """
    for fold in xrange(k):
        train_fold, test_fold = divideDataset(tweetList, k, fold)

        # Third element of the result is not needed in this routine.
        ngram_dist, languages, _all_languages = utils.obtainNgrams(
            train_fold, maxNgram)
        coefficients = linear.getlinearcoefficientsForLanguageArray(
            languages, maxNgram, ngram_dist)
        # Single-argument parenthesised print: same output as the print
        # statement under Python 2.
        print(coefficients)

        hits = 0
        seen = 0
        for tweet in test_fold:
            # The probability returned next to the label is not used.
            guess, _probability = linear.getPredictedLanguageForTweet(
                coefficients, tweet.text, maxNgram, ngram_dist)
            utils.printResultTXT(guess, tweet)
            if guess == tweet.language:
                hits += 1
            seen += 1

        print('correct tweets fold '+str(fold)+' = '+str(hits)+'/'+str(seen))
tweetListPreProcessed_train = preprocess.main(tweetList_train) tweetListPreProcessed_test = preprocess.main(tweetList_test) # shuffle(tweetListPreProcessed) # 3-. Algorithms # 3.1-. Algorithms: Bayesian Networks # 3.2.1-. Linear interpolation # Generate linear coefficients: input (n-grams and language) # Smooth data # cv.crossValidationLinearInterpolation(tweetListPreProcessed_train, 3, maxNgram) linearCoefficientsAll = list() trainDist, arrayLanguages, languagesAll = utils.obtainNgrams(tweetListPreProcessed_train, maxNgram) for gram in xrange(1, maxNgram+1): linearCoefficientsAll.append(linear.getlinearcoefficientsForLanguageArray(arrayLanguages, gram, trainDist)) print linearCoefficientsAll # linearCoefficientsALL = read.readLinearCoefficients(LI_Coefficients) count = 4 # Desde que gram empezar for i in xrange(count, maxNgram): count = count + 1 t0 = time.time() for tweet in tweetListPreProcessed_test:
# Load the evaluation tweets from the path held in `test`.
tweetListtest = read.read_tweets_dataset(test)

# 2-. Pre-processing stage
#       raw data   -> tweetList / tweetListtest
#       clean data -> tweetListPreProcessed / tweetListPreProcessedtest
tweetListPreProcessed = preprocess.main(tweetList)
tweetListPreProcessedtest = preprocess.main(tweetListtest)
# Only the first (non-test) list is shuffled, in place.
shuffle(tweetListPreProcessed)
# Debug helper (disabled): utils.printTweets(tweetListPreProcessed)

# 3-. Algorithms
#
# 3.1-. Obtain n-grams.
#       Note that maxNgram + 1 (not maxNgram) is what gets passed on.
corpusNgrams, arrayLanguages, _unorderedLanguagesFull = utils.obtainNgrams(
    tweetListPreProcessed, maxNgram+1)
arrayLanguagesFull = utils.orderVector(_unorderedLanguagesFull)
# Example lookup: print(corpusNgrams.get(str(3)).get('pt'))

# 3.2-. Algorithms: Bayesian Networks
#   3.2.1-. Linear interpolation
#           Generate linear coefficients: input (n-grams and language)
#           Smooth data

# Sample sentences, one per language code used in the variable name.
tweetEN = "Tomorrow is going to be a good day to go to the beach."
tweetPT = "Amanhã será um dia muito bom, como ir para a praia."
tweetCA = "Demà farà un dia molt bo, com per anar a la platja."
tweetEU = "Bihar egun oso ona egingo du, hondartzara joateko modukoa."
# _____________________________________________________________________________
# Script: compute linear-interpolation coefficients for several n-gram orders
# and append them to a tab-separated text file.
#
# Positional command-line arguments:
#   sys.argv[1]  path of the tweet dataset to read
#   sys.argv[2]  maximum n-gram order (parsed with int())
#
# Output: '../Dataset/LICoefficients_<maxNgram>gram_for-<dataset stem>.txt'
#   (opened in append mode, so repeated runs add rows to the same file).
#   One row per entry of the coefficient list:
#       <order> TAB <entry[0]> TAB <entry[1]> ... TAB <entry[order]>
#   NOTE(review): entry[0] presumably identifies the language and the
#   remaining fields are the coefficients - confirm in `linear`.

# 1-. Read the dataset and build tweetList (Tweet objects, per `read`)
dataset = sys.argv[1]
maxNgram = int(sys.argv[2])
# Pieces of the dataset file name split on '.'; element 0 is used below.
filename = os.path.basename(dataset).split('.')

tweetList = read.read_tweets_dataset(dataset)

# 2-. Pre-processing stage
#       raw data   -> tweetList
#       clean data -> tweetListPreProcessed
tweetListPreProcessed = preprocess.main(tweetList)

# 3-. Obtain n-grams and linear coefficients
# The output path does not depend on the loop variable, so build it once.
coefficientsPath = ('../Dataset/LICoefficients_' + str(maxNgram)
                    + 'gram_for-' + str(filename[0]) + '.txt')

# Orders below 5 are deliberately not processed (start value kept as-is).
for i in xrange(5, maxNgram+1):
    # obtainNgrams receives i+1 while the coefficients are fitted for order i.
    corpusNgrams, arrayLanguages, arrayLanguagesFull = utils.obtainNgrams(
        tweetListPreProcessed, i+1)
    linearCoefficients = linear.getlinearcoefficientsForLanguageArray(
        arrayLanguages, i, corpusNgrams)

    # 'with' closes the file even when formatting or writing a row raises;
    # the previous open()/close() pair leaked the handle in that case and
    # bound the name `file`, shadowing the Python 2 builtin.
    with open(coefficientsPath, 'a+') as coefficientsFile:
        for li in linearCoefficients:
            # Fields: the order i, then li[0] .. li[i]; written as one row.
            fields = [str(i)] + [str(li[co]) for co in xrange(i+1)]
            coefficientsFile.write("\t".join(fields) + "\n")