Exemple #1
0
def plotProbs():
    binSize = 2
    maxPrice = 100
    dataSplit = 0.70

    allItems = getItems(complete=True, sold=True, genre=ROCK)
    #remove genre parameter to search all genres
    allItems = filterItems(allItems)

    [trainItems, testItems] = splitItemSet(allItems, dataSplit)

    bins = generateBinArray(binSize, maxPrice)
    [actualFinalPrice,
     actualFinalPriceBinned] = getFinalPrices(testItems, bins)

    ##Only have to make these onece
    orderedWordList = generateOrderedWordList(allItems)
    [testMatrix, testCategory
     ] = generateMatrixData(orderedWordList,
                            generateItemTitleList(testItems, orderedWordList))

    predictedFinalPrices = [-1] * len(testItems)
    psold = []
    punsold = []
    predict = []
    for priceCutOff in bins:
        print "Price cut off: ", priceCutOff
        #Have to calculate these at every priceCutOff increment
        [phi_k_unsold, phi_k_sold, p_y0,
         p_y1] = trainOnData(trainItems, orderedWordList, priceCutOff)
        testCategory = generateTrainCategory(
            testItems, priceCutOff)  #actual category for testItems
        [testingSetPredictions, prob_sell,
         prob_wontSell] = makePredictions(testMatrix, phi_k_sold, phi_k_unsold,
                                          p_y0, p_y1)
        #predicted category for testItems [0,1]

        predict.append(testingSetPredictions[2])
        psold.append(prob_sell[2])
        punsold.append(prob_wontSell[2])

        predictedFinalPrices = updatePredictedFinalPrice(
            testItems, predictedFinalPrices, testingSetPredictions,
            priceCutOff, binSize, bins)

    for i in range(len(predictedFinalPrices)):
        if predictedFinalPrices[i] == -1: predictedFinalPrices[i] = bins[-1]

    for i in range(len(predictedFinalPrices)):
        print i, testItems[i][0], testItems[i][START_PRICE], actualFinalPrice[
            i], getBinOf(bins, float(
                testItems[i][START_PRICE])), actualFinalPriceBinned[
                    i], predictedFinalPrices[i], "\t\t", testItems[i][TITLE]

    print "Classification error on testing set is: ", classificationError(
        predictedFinalPrices, actualFinalPriceBinned)
def crossValidate(binSize, num):
    """Score the price-bin predictor for one bin size and training-set size.

    Loads completed, sold items, filters and splits them 70/30, trims the
    training set with ``selectNumItems(trainItems, num)``, retrains at every
    cut-off in the bin array and derives one predicted price bin per test
    item.  The (predicted, actual) price pairs are written to
    ``predictedactualprices.csv`` in the current working directory.

    Args:
        binSize: width of a price bin; forwarded to ``generateBinArray`` and
            used as the step between consecutive cut-offs.
        num: forwarded to ``selectNumItems`` (presumably the number of
            training items to keep -- confirm against that helper).

    Returns:
        Whatever ``classificationError`` returns for the predicted bins
        versus the binned actual final prices.
    """
    maxPrice = 100
    dataSplit = 0.70

    allItems = filterItems(getItems(complete=True, sold=True))
    trainItems, testItems = splitItemSet(allItems, dataSplit)
    trainItems = selectNumItems(trainItems, num)

    # Actual final price per test item; items that did not sell count as 0.
    actualFinalPrice = [
        float(row[END_PRICE]) if didItemSell(row) else 0 for row in testItems
    ]

    bins = generateBinArray(binSize, maxPrice)
    actualFinalPriceBinned = binnedFinalPrice(bins, actualFinalPrice)

    # The word list and the test matrix do not depend on the cut-off, so
    # they are built a single time.
    orderedWordList = generateOrderedWordList(allItems)
    testMatrix, testCategory = generateMatrixData(
        orderedWordList, generateItemTitleList(testItems, orderedWordList))

    predictedFinalPrice = [-1] * len(testItems)
    for priceCutOff in bins:
        # The model parameters have to be recomputed at every cut-off.
        phi_k_unsold, phi_k_sold = trainOnData(
            trainItems, orderedWordList, priceCutOff)
        # NOTE(review): the actual category is never read in this function;
        # the call is kept only in case the helper has side effects --
        # confirm and drop.
        testCategory = generateTrainCategory(testItems, priceCutOff)
        testingSetPredictions = makePredictions(
            testMatrix, phi_k_sold, phi_k_unsold)

        previousCutOff = priceCutOff - binSize
        for i, item in enumerate(testItems):
            # Only the first cut-off predicted as category 0 fixes the price.
            if testingSetPredictions[i] == 0 and predictedFinalPrice[i] == -1:
                # NOTE(review): the prediction reads the test item's own
                # END_PRICE -- confirm this is intended.
                endPrice = float(item[END_PRICE])
                if previousCutOff > endPrice:
                    predictedFinalPrice[i] = previousCutOff
                # Complementary test kept explicit so that a NaN end price
                # matches neither branch, exactly as before.
                elif previousCutOff <= endPrice:
                    binned = getBinOf(bins, endPrice)
                    predictedFinalPrice[i] = 0 if binned < 0 else binned

    # Entries still holding the -1 placeholder are assigned the last bin.
    for i, price in enumerate(predictedFinalPrice):
        if price == -1:
            predictedFinalPrice[i] = bins[-1]

    # The context manager closes the file even if a write fails.
    with open('predictedactualprices.csv', 'w') as outFile:
        for predicted, actual in zip(predictedFinalPrice, actualFinalPrice):
            outFile.write(str(predicted) + ", " + str(actual) + "\n")

    return classificationError(predictedFinalPrice, actualFinalPriceBinned)
def completePredictions():
    binSize = 5;
    maxPrice = 150;
    dataSplit = 0.85;

    print "Loading items... "
    allItems                = getItems(complete=True,sold=True);   #remove genre parameter to search all genres
    allItems                = filterItems(allItems);

    [trainItems,testItems]  = splitItemSet(allItems,dataSplit);


    bins = BinArray(binSize,maxPrice);
    [actualFinalPrice,actualFinalPriceBinned] = getFinalPrices(testItems,bins);

    ##Only have to make these onece
    orderedWordList = generateOrderedWordList(allItems);
    [testMatrix,testCategory]   = generateMatrixData(orderedWordList,generateItemTitleList(testItems,orderedWordList));

    ##Predict whether it will sell at all or not
    predictedFinalPrices        = [-1]*len(testItems);
    [phi_k_unsold,phi_k_sold,p_y0,p_y1]   = trainOnData(trainItems,orderedWordList,0);
    testCategory                = generateTrainCategory(testItems,0)            #actual category for testItems
    testingSetPredictions       = makePredictions(testMatrix,phi_k_sold,phi_k_unsold,p_y0,p_y1);    #predicted category for testItems [0,1]
    predictedFinalPrices = updatePredictedFinalPrice(testItems,predictedFinalPrices,testingSetPredictions,0,binSize,bins);
    print "Classification error on for sell/not sell is set is: ", classificationError(testingSetPredictions,testCategory);

    predictedFinalPrices = [-1]*len(testItems);
    for priceCutOff in bins:
        #print "Price cut off: ", priceCutOff
        #Have to calculate these at every priceCutOff increment
        [phi_k_unsold,phi_k_sold,p_y0,p_y1]   = trainOnData(trainItems,orderedWordList,priceCutOff);
        testCategory                = generateTrainCategory(testItems,priceCutOff)            #actual category for testItems
        testingSetPredictions       = makePredictions(testMatrix,phi_k_sold,phi_k_unsold,p_y0,p_y1);    #predicted category for testItems [0,1]

        predictedFinalPrices = updatePredictedFinalPrice(testItems,predictedFinalPrices,testingSetPredictions,priceCutOff,binSize,bins);

    for i in range(len(predictedFinalPrices)):
        if predictedFinalPrices[i] == -1: predictedFinalPrices[i] = bins[-1];
        
    for i in range(len(predictedFinalPrices)):
        print testItems[i][0],testItems[i][START_PRICE],actualFinalPrice[i],getBinOf(bins,float(testItems[i][START_PRICE])), actualFinalPriceBinned[i],predictedFinalPrices[i],"\t\t",testItems[i][TITLE]

    print "Classification error on testing set is: ", classificationError(predictedFinalPrices,actualFinalPriceBinned);
def plotProbs():
    binSize = 2;
    maxPrice = 100;
    dataSplit = 0.70;

    allItems                = getItems(complete=True,sold=True,genre=ROCK);   #remove genre parameter to search all genres
    allItems                = filterItems(allItems);

    [trainItems,testItems]  = splitItemSet(allItems,dataSplit);

    bins = generateBinArray(binSize,maxPrice);
    [actualFinalPrice,actualFinalPriceBinned] = getFinalPrices(testItems,bins);

    ##Only have to make these onece
    orderedWordList = generateOrderedWordList(allItems);
    [testMatrix,testCategory]   = generateMatrixData(orderedWordList,generateItemTitleList(testItems,orderedWordList));

    predictedFinalPrices = [-1]*len(testItems);
    psold = [];
    punsold = [];
    predict = [];
    for priceCutOff in bins:
        print "Price cut off: ", priceCutOff
        #Have to calculate these at every priceCutOff increment
        [phi_k_unsold,phi_k_sold,p_y0,p_y1]   = trainOnData(trainItems,orderedWordList,priceCutOff);
        testCategory                = generateTrainCategory(testItems,priceCutOff)            #actual category for testItems
        [testingSetPredictions,prob_sell,prob_wontSell]         = makePredictions(testMatrix,phi_k_sold,phi_k_unsold,p_y0,p_y1);    #predicted category for testItems [0,1]

        predict.append(testingSetPredictions[2]);
        psold.append(prob_sell[2]);
        punsold.append(prob_wontSell[2]);

        predictedFinalPrices = updatePredictedFinalPrice(testItems,predictedFinalPrices,testingSetPredictions,priceCutOff,binSize,bins);

    for i in range(len(predictedFinalPrices)):
        if predictedFinalPrices[i] == -1: predictedFinalPrices[i] = bins[-1];
        
    for i in range(len(predictedFinalPrices)):
        print i,testItems[i][0],testItems[i][START_PRICE],actualFinalPrice[i],getBinOf(bins,float(testItems[i][START_PRICE])), actualFinalPriceBinned[i],predictedFinalPrices[i],"\t\t",testItems[i][TITLE]

    print "Classification error on testing set is: ", classificationError(predictedFinalPrices,actualFinalPriceBinned);
def crossValidate(binSize):
    """Return the classification error of the price-bin predictor.

    Loads completed, sold items, filters and splits them 70/30, retrains at
    every cut-off in the bin array and folds the per-cut-off predictions
    into one predicted final price per test item via
    ``updatePredictedFinalPrice``.

    Args:
        binSize: width of a price bin; forwarded to ``generateBinArray`` and
            ``updatePredictedFinalPrice``.

    Returns:
        Whatever ``classificationError`` returns for the predicted bins
        versus the binned actual final prices.
    """
    maxPrice = 100
    dataSplit = 0.70

    allItems = filterItems(getItems(complete=True, sold=True))
    trainItems, testItems = splitItemSet(allItems, dataSplit)

    bins = generateBinArray(binSize, maxPrice)
    # Only the binned prices are needed for scoring.
    _, actualFinalPriceBinned = getFinalPrices(testItems, bins)

    # The word list and the test matrix do not depend on the cut-off, so
    # they are built a single time.
    orderedWordList = generateOrderedWordList(allItems)
    testMatrix, _ = generateMatrixData(
        orderedWordList, generateItemTitleList(testItems, orderedWordList))

    predictedFinalPrices = [-1] * len(testItems)
    for priceCutOff in bins:
        # The model parameters have to be recomputed at every cut-off.
        phi_k_unsold, phi_k_sold, p_y0, p_y1 = trainOnData(
            trainItems, orderedWordList, priceCutOff)
        # NOTE(review): the actual category is never read in this function;
        # the call is kept only in case the helper has side effects --
        # confirm and drop.
        generateTrainCategory(testItems, priceCutOff)
        # makePredictions also returns the two probability arrays; only the
        # predicted categories are needed here.  The earlier per-cut-off
        # sampling of element [2] was never read and raised IndexError for
        # test sets with fewer than three items, so it has been removed.
        testingSetPredictions, _, _ = makePredictions(
            testMatrix, phi_k_sold, phi_k_unsold, p_y0, p_y1)

        predictedFinalPrices = updatePredictedFinalPrice(
            testItems, predictedFinalPrices, testingSetPredictions,
            priceCutOff, binSize, bins)

    # Entries still holding the -1 placeholder are assigned the last bin.
    for idx, price in enumerate(predictedFinalPrices):
        if price == -1:
            predictedFinalPrices[idx] = bins[-1]

    return classificationError(predictedFinalPrices, actualFinalPriceBinned)