예제 #1
0
def crossValidationRanking(m, n, dataSet, step=5):
    """Estimate the error rate of the n-gram ranking classifier by hold-out.

    Every ``step``-th sample of ``dataSet`` is held out in turn while all the
    other samples form the training set. The held-out text is classified
    with n-gram sizes 2, 3 and 4, and the three predictions are combined by
    majority vote; when all three disagree, the size-3 prediction is used.
    One "predicted / target" line is printed per held-out sample.

    Args:
        m: Forwarded unchanged to ``rmc.outofplaceMeasureSet``.
        n: Forwarded unchanged to ``rmc.outofplaceMeasureSet``.
        dataSet: List of samples. ``sample[0]`` is the text handed to the
            classifier and ``sample[1]`` is the target label string that is
            compared against ``langArray[predicted_index]``. The list is
            not modified.
        step: Distance between consecutive held-out indices (default 5,
            the stride that used to be hard-coded).

    Returns:
        The fraction of held-out samples that were misclassified, as a
        float; ``0.0`` when ``dataSet`` is empty.

    Raises:
        ValueError: If ``step`` is smaller than 1.
    """
    if step < 1:
        raise ValueError("step must be a positive integer")

    errors = 0
    evaluated = 0

    for heldOut in range(0, len(dataSet), step):
        testSet = dataSet[heldOut]
        # Build the training set as a *new* list. Popping from an alias of
        # dataSet would permanently drop the sample from the caller's data
        # and from every later training round.
        trainSet = dataSet[:heldOut] + dataSet[heldOut + 1:]

        # The training texts do not depend on the n-gram size, so they are
        # computed once per held-out sample.
        allTexts = getAllLanguagesSet(trainSet)

        predictedLabel = []
        for nGramSize in range(2, 5):
            allFreq = returnNgramFreqSetRanking(allTexts, nGramSize)
            probList = rmc.outofplaceMeasureSet(
                m, n, allFreq, testSet[0], nGramSize)
            predictedLabel.append(probList.index(max(probList)))

        # With three votes at most one label can occur more than once.
        # If none does, fall back to the middle (n-gram size 3) prediction.
        label, votes = Counter(predictedLabel).most_common(1)[0]
        predictedLabelTotal = label if votes > 1 else predictedLabel[1]

        # Single parenthesised argument: prints identically on Python 2 and 3.
        print('predicted: ' + langArray[predictedLabelTotal] + "\ttarget: " + testSet[1])

        evaluated += 1
        if langArray[predictedLabelTotal] != testSet[1]:
            errors += 1

    if not evaluated:
        return 0.0
    # Count the rounds explicitly and force true division; integer division
    # would truncate the rate to 0 on Python 2.
    return float(errors) / evaluated
예제 #2
0
# Build the training texts from dataSet and run them through the project's
# formatting helper.
allTexts = utils.formatDataset(utils.getAllLanguagesSet(dataSet))

# The sentence to classify is the first command-line argument, cleaned with
# the same helper module.
sentence = utils.cleanTweets(sys.argv[1])

# Result of the Lidstone classifier; it is kept in `language` but not
# combined with the ranking vote below.
language = llc.lidstoneLanguageClassification(sentence, allTexts)

# Both values are forwarded unchanged to rmc.outofplaceMeasureSet.
# NOTE(review): their meaning is defined by that helper - confirm there.
m = 80
n = 100

# One prediction per n-gram size (2, 3 and 4): the index of the largest
# entry in the list returned by the out-of-place measure.
predictedLabel = []
for nGramSize in range(2, 5):
    allFreq = utils.returnNgramFreqSetRanking(allTexts, nGramSize)
    probList = rmc.outofplaceMeasureSet(m, n, allFreq, sentence, nGramSize)
    predictedLabel.append(probList.index(max(probList)))

# Labels predicted by more than one n-gram size.
k = [label for label, votes in Counter(predictedLabel).items() if votes > 1]

# Majority label when there is one, otherwise the n-gram size 3 prediction.
predictedLabelTotal = k[0] if k else predictedLabel[1]

#print 'tweet: ' + str(testSet[0])
# print 'predicted: ' + langArray[predictedLabelTotal] + "\ttarget: " + testSet[1]


# print language