def plotProbs(): binSize = 2 maxPrice = 100 dataSplit = 0.70 allItems = getItems(complete=True, sold=True, genre=ROCK) #remove genre parameter to search all genres allItems = filterItems(allItems) [trainItems, testItems] = splitItemSet(allItems, dataSplit) bins = generateBinArray(binSize, maxPrice) [actualFinalPrice, actualFinalPriceBinned] = getFinalPrices(testItems, bins) ##Only have to make these onece orderedWordList = generateOrderedWordList(allItems) [testMatrix, testCategory ] = generateMatrixData(orderedWordList, generateItemTitleList(testItems, orderedWordList)) predictedFinalPrices = [-1] * len(testItems) psold = [] punsold = [] predict = [] for priceCutOff in bins: print "Price cut off: ", priceCutOff #Have to calculate these at every priceCutOff increment [phi_k_unsold, phi_k_sold, p_y0, p_y1] = trainOnData(trainItems, orderedWordList, priceCutOff) testCategory = generateTrainCategory( testItems, priceCutOff) #actual category for testItems [testingSetPredictions, prob_sell, prob_wontSell] = makePredictions(testMatrix, phi_k_sold, phi_k_unsold, p_y0, p_y1) #predicted category for testItems [0,1] predict.append(testingSetPredictions[2]) psold.append(prob_sell[2]) punsold.append(prob_wontSell[2]) predictedFinalPrices = updatePredictedFinalPrice( testItems, predictedFinalPrices, testingSetPredictions, priceCutOff, binSize, bins) for i in range(len(predictedFinalPrices)): if predictedFinalPrices[i] == -1: predictedFinalPrices[i] = bins[-1] for i in range(len(predictedFinalPrices)): print i, testItems[i][0], testItems[i][START_PRICE], actualFinalPrice[ i], getBinOf(bins, float( testItems[i][START_PRICE])), actualFinalPriceBinned[ i], predictedFinalPrices[i], "\t\t", testItems[i][TITLE] print "Classification error on testing set is: ", classificationError( predictedFinalPrices, actualFinalPriceBinned)
def crossValidate(binSize, num):
    """Train on a size-limited training set and return the classification error.

    Items come from getItems(complete=True, sold=True), are filtered with
    filterItems and divided by splitItemSet using dataSplit = 0.70; the
    training part is then reduced with selectNumItems(trainItems, num).
    For every cut-off in the bin array the model is retrained and the test
    matrix classified; the first time an item is predicted as category 0
    its predicted price is fixed.  Items never fixed get the last bin.

    Side effect: (re)writes 'predictedactualprices.csv' in the current
    working directory with one "predicted, actual" line per test item.

    Args:
        binSize: bin width handed to generateBinArray; also subtracted
            from the current cut-off when fixing a predicted price.
        num: forwarded to selectNumItems to limit the training items.

    Returns:
        Whatever classificationError(predictedFinalPrice,
        actualFinalPriceBinned) returns.

    NOTE(review): a one-argument crossValidate(binSize) is defined later in
    the visible part of this file and rebinds the name, so this version is
    not reachable by name after import -- confirm which one is intended.
    NOTE(review): here trainOnData is unpacked into two values and
    makePredictions takes three arguments, while the other callers in this
    file use four values / five arguments -- confirm against the helpers.
    """
    maxPrice = 100
    dataSplit = 0.70

    allItems = getItems(complete=True, sold=True)
    allItems = filterItems(allItems)
    trainItems, testItems = splitItemSet(allItems, dataSplit)
    trainItems = selectNumItems(trainItems, num)

    # Unsold items are recorded with a final price of 0.
    actualFinalPrice = []
    for row in testItems:
        if didItemSell(row):
            actualFinalPrice.append(float(row[END_PRICE]))
        else:
            actualFinalPrice.append(0)

    bins = generateBinArray(binSize, maxPrice)
    actualFinalPriceBinned = binnedFinalPrice(bins, actualFinalPrice)

    # The word list and test matrix do not depend on the cut-off: build once.
    orderedWordList = generateOrderedWordList(allItems)
    testMatrix, testCategory = generateMatrixData(
        orderedWordList, generateItemTitleList(testItems, orderedWordList))

    predictedFinalPrice = [-1] * len(testItems)  # -1 marks "not fixed yet"

    for priceCutOff in bins:
        # The model has to be refitted at every cut-off increment.
        phi_k_unsold, phi_k_sold = trainOnData(
            trainItems, orderedWordList, priceCutOff)
        # Actual category for testItems.  The result is not read below; the
        # call is kept because its purity cannot be verified from here.
        testCategory = generateTrainCategory(testItems, priceCutOff)
        # Predicted category for testItems.
        testingSetPredictions = makePredictions(
            testMatrix, phi_k_sold, phi_k_unsold)

        previousEdge = priceCutOff - binSize
        for i, item in enumerate(testItems):
            # Only items predicted as 0 that have no price yet are touched.
            if testingSetPredictions[i] != 0 or predictedFinalPrice[i] != -1:
                continue

            endPrice = float(item[END_PRICE])
            if previousEdge > endPrice:
                predictedFinalPrice[i] = previousEdge
            elif previousEdge <= endPrice:
                predictedFinalPrice[i] = getBinOf(bins, endPrice)

            # NOTE(review): the original line structure was lost; this clamp
            # is placed inside the "predicted 0" branch because applying it
            # to every item would erase the -1 sentinel after the first
            # cut-off -- confirm against the author's intent.
            if predictedFinalPrice[i] < 0:
                predictedFinalPrice[i] = 0

    # Anything never fixed during the sweep falls into the last bin.  Done
    # before opening the file so a failure here cannot truncate the CSV.
    for i, price in enumerate(predictedFinalPrice):
        if price == -1:
            predictedFinalPrice[i] = bins[-1]

    # 'with' guarantees the handle is closed even if a write fails, and the
    # handle no longer shadows the built-in name 'file'.
    with open('predictedactualprices.csv', 'w') as outFile:
        for predicted, actual in zip(predictedFinalPrice, actualFinalPrice):
            outFile.write(str(predicted) + ", " + str(actual) + "\n")

    return classificationError(predictedFinalPrice, actualFinalPriceBinned)
def completePredictions(): binSize = 5; maxPrice = 150; dataSplit = 0.85; print "Loading items... " allItems = getItems(complete=True,sold=True); #remove genre parameter to search all genres allItems = filterItems(allItems); [trainItems,testItems] = splitItemSet(allItems,dataSplit); bins = BinArray(binSize,maxPrice); [actualFinalPrice,actualFinalPriceBinned] = getFinalPrices(testItems,bins); ##Only have to make these onece orderedWordList = generateOrderedWordList(allItems); [testMatrix,testCategory] = generateMatrixData(orderedWordList,generateItemTitleList(testItems,orderedWordList)); ##Predict whether it will sell at all or not predictedFinalPrices = [-1]*len(testItems); [phi_k_unsold,phi_k_sold,p_y0,p_y1] = trainOnData(trainItems,orderedWordList,0); testCategory = generateTrainCategory(testItems,0) #actual category for testItems testingSetPredictions = makePredictions(testMatrix,phi_k_sold,phi_k_unsold,p_y0,p_y1); #predicted category for testItems [0,1] predictedFinalPrices = updatePredictedFinalPrice(testItems,predictedFinalPrices,testingSetPredictions,0,binSize,bins); print "Classification error on for sell/not sell is set is: ", classificationError(testingSetPredictions,testCategory); predictedFinalPrices = [-1]*len(testItems); for priceCutOff in bins: #print "Price cut off: ", priceCutOff #Have to calculate these at every priceCutOff increment [phi_k_unsold,phi_k_sold,p_y0,p_y1] = trainOnData(trainItems,orderedWordList,priceCutOff); testCategory = generateTrainCategory(testItems,priceCutOff) #actual category for testItems testingSetPredictions = makePredictions(testMatrix,phi_k_sold,phi_k_unsold,p_y0,p_y1); #predicted category for testItems [0,1] predictedFinalPrices = updatePredictedFinalPrice(testItems,predictedFinalPrices,testingSetPredictions,priceCutOff,binSize,bins); for i in range(len(predictedFinalPrices)): if predictedFinalPrices[i] == -1: predictedFinalPrices[i] = bins[-1]; for i in range(len(predictedFinalPrices)): print 
testItems[i][0],testItems[i][START_PRICE],actualFinalPrice[i],getBinOf(bins,float(testItems[i][START_PRICE])), actualFinalPriceBinned[i],predictedFinalPrices[i],"\t\t",testItems[i][TITLE] print "Classification error on testing set is: ", classificationError(predictedFinalPrices,actualFinalPriceBinned);
def plotProbs(): binSize = 2; maxPrice = 100; dataSplit = 0.70; allItems = getItems(complete=True,sold=True,genre=ROCK); #remove genre parameter to search all genres allItems = filterItems(allItems); [trainItems,testItems] = splitItemSet(allItems,dataSplit); bins = generateBinArray(binSize,maxPrice); [actualFinalPrice,actualFinalPriceBinned] = getFinalPrices(testItems,bins); ##Only have to make these onece orderedWordList = generateOrderedWordList(allItems); [testMatrix,testCategory] = generateMatrixData(orderedWordList,generateItemTitleList(testItems,orderedWordList)); predictedFinalPrices = [-1]*len(testItems); psold = []; punsold = []; predict = []; for priceCutOff in bins: print "Price cut off: ", priceCutOff #Have to calculate these at every priceCutOff increment [phi_k_unsold,phi_k_sold,p_y0,p_y1] = trainOnData(trainItems,orderedWordList,priceCutOff); testCategory = generateTrainCategory(testItems,priceCutOff) #actual category for testItems [testingSetPredictions,prob_sell,prob_wontSell] = makePredictions(testMatrix,phi_k_sold,phi_k_unsold,p_y0,p_y1); #predicted category for testItems [0,1] predict.append(testingSetPredictions[2]); psold.append(prob_sell[2]); punsold.append(prob_wontSell[2]); predictedFinalPrices = updatePredictedFinalPrice(testItems,predictedFinalPrices,testingSetPredictions,priceCutOff,binSize,bins); for i in range(len(predictedFinalPrices)): if predictedFinalPrices[i] == -1: predictedFinalPrices[i] = bins[-1]; for i in range(len(predictedFinalPrices)): print i,testItems[i][0],testItems[i][START_PRICE],actualFinalPrice[i],getBinOf(bins,float(testItems[i][START_PRICE])), actualFinalPriceBinned[i],predictedFinalPrices[i],"\t\t",testItems[i][TITLE] print "Classification error on testing set is: ", classificationError(predictedFinalPrices,actualFinalPriceBinned);
def crossValidate(binSize):
    """Return the classification error of the cut-off sweep for one bin size.

    Items come from getItems(complete=True, sold=True), are filtered with
    filterItems and divided by splitItemSet using dataSplit = 0.70.  For
    every cut-off in generateBinArray(binSize, 100) the model is retrained
    with trainOnData, the test matrix is classified with makePredictions,
    and updatePredictedFinalPrice folds the result into the per-item price
    list.  Items still at the -1 sentinel afterwards get the last bin.

    Args:
        binSize: bin width handed to generateBinArray and to
            updatePredictedFinalPrice.

    Returns:
        Whatever classificationError(predictedFinalPrices,
        actualFinalPriceBinned) returns.

    NOTE(review): an earlier crossValidate(binSize, num) precedes this
    definition in the visible part of this file; this later one-argument
    version is the one that stays bound to the name after import.
    """
    maxPrice = 100
    dataSplit = 0.70

    allItems = filterItems(getItems(complete=True, sold=True))
    trainItems, testItems = splitItemSet(allItems, dataSplit)
    bins = generateBinArray(binSize, maxPrice)
    actualFinalPrice, actualFinalPriceBinned = getFinalPrices(testItems, bins)

    # The word list and test matrix do not depend on the cut-off: build once.
    orderedWordList = generateOrderedWordList(allItems)
    testMatrix, testCategory = generateMatrixData(
        orderedWordList, generateItemTitleList(testItems, orderedWordList))

    predictedFinalPrices = [-1] * len(testItems)  # -1 marks "not priced yet"

    # The former psold/punsold/predict lists were removed.  They sampled
    # index [2] of each prediction result and were never read, so their only
    # effect was an IndexError when fewer than three test items existed.
    # Calls to project helpers are kept as they were, since their purity
    # cannot be verified from this block.
    for priceCutOff in bins:
        # The model has to be refitted at every cut-off increment.
        phi_k_unsold, phi_k_sold, p_y0, p_y1 = trainOnData(
            trainItems, orderedWordList, priceCutOff)
        # Actual category for testItems (result not read in this function).
        testCategory = generateTrainCategory(testItems, priceCutOff)
        # Predicted category for testItems [0,1]; the two probability
        # sequences returned alongside are not needed here.
        testingSetPredictions, _probSell, _probWontSell = makePredictions(
            testMatrix, phi_k_sold, phi_k_unsold, p_y0, p_y1)
        predictedFinalPrices = updatePredictedFinalPrice(
            testItems, predictedFinalPrices, testingSetPredictions,
            priceCutOff, binSize, bins)

    # Anything never assigned during the sweep falls into the last bin.
    for i, price in enumerate(predictedFinalPrices):
        if price == -1:
            predictedFinalPrices[i] = bins[-1]

    return classificationError(predictedFinalPrices, actualFinalPriceBinned)