def crossValidationRanking(m, n, dataSet):
    """Estimate the error rate of the n-gram ranking classifier by hold-out testing.

    Every fifth sample of ``dataSet`` (indices 0, 5, 10, ...) is held out in
    turn and classified against a model built from all remaining samples.
    Each held-out sample is classified three times, with n-gram sizes 2, 3
    and 4, and the three predictions are combined by majority vote. When all
    three disagree, the prediction made with n-gram size 3 is used.

    A line of the form ``predicted: <lang>\\ttarget: <lang>`` is printed for
    every sample that is tested.

    Args:
        m: Forwarded unchanged to ``rmc.outofplaceMeasureSet``.
        n: Forwarded unchanged to ``rmc.outofplaceMeasureSet``.
            NOTE(review): the meaning of ``m`` and ``n`` is defined by that
            helper and is not visible from this function.
        dataSet: Sequence of samples where ``sample[0]`` is the text to
            classify and ``sample[1]`` is the target language as a string
            comparable with the entries of ``langArray``. The sequence is
            not modified.

    Returns:
        The fraction of tested samples that were misclassified, as a float
        in ``[0.0, 1.0]``; ``0.0`` when ``dataSet`` is empty.
    """
    step = 5  # only every fifth sample is used as a test sample
    errors = 0
    evaluated = 0

    for heldOut in range(0, len(dataSet), step):
        # Work on a copy so that removing the test sample does not shrink
        # the caller's data set or shift the indices of later test samples.
        trainSet = list(dataSet)
        testSet = trainSet.pop(heldOut)

        # The training texts do not depend on the n-gram size, so build
        # them once per held-out sample rather than once per size.
        allTexts = getAllLanguagesSet(trainSet)

        predictedLabel = []
        for nGramSize in range(2, 5):
            allFreq = returnNgramFreqSetRanking(allTexts, nGramSize)
            probList = rmc.outofplaceMeasureSet(m, n, allFreq, testSet[0], nGramSize)
            # NOTE(review): the entry with the highest score is taken as the
            # winner; confirm that outofplaceMeasureSet returns
            # "higher is better" scores rather than raw distances.
            predictedLabel.append(probList.index(max(probList)))

        # With three votes at most one label can occur more than once.
        majority = [label for label, votes in Counter(predictedLabel).items() if votes > 1]
        if majority:
            predictedLabelTotal = majority[0]
        else:
            # No agreement: fall back to the n-gram size 3 prediction.
            predictedLabelTotal = predictedLabel[1]

        predictedLanguage = langArray[predictedLabelTotal]
        print('predicted: ' + predictedLanguage + "\ttarget: " + testSet[1])

        evaluated += 1
        if predictedLanguage != testSet[1]:
            errors += 1

    if not evaluated:
        return 0.0
    # float() forces true division on Python 2 as well.
    return float(errors) / evaluated
# --- Classify the sentence passed as the first command-line argument ---
# `dataSet` is expected to have been defined before this point.
allTexts = utils.formatDataset(utils.getAllLanguagesSet(dataSet))
sentence = utils.cleanTweets(sys.argv[1])

# Prediction of the Lidstone-based classifier.
language = llc.lidstoneLanguageClassification(sentence, allTexts)

# Prediction of the ranking classifier: one vote per n-gram size (2, 3, 4).
m, n = 80, 100
predictedLabel = []
for nGramSize in range(2, 5):
    allFreq = utils.returnNgramFreqSetRanking(allTexts, nGramSize)
    probList = rmc.outofplaceMeasureSet(m, n, allFreq, sentence, nGramSize)
    predictedLabel.append(probList.index(max(probList)))

# Majority vote over the three predictions; when no label occurs more than
# once, the n-gram size 3 prediction (index 1) is used.
k = [label for label, votes in Counter(predictedLabel).items() if votes > 1]
predictedLabelTotal = k[0] if k else predictedLabel[1]

# Output is currently disabled: neither `language` nor the voted label
# (`predictedLabelTotal`) is printed.