{ "tagName": tagName, "max": int(max), "min": int(min), "mean" : int(mean), "std": float(std), "loanType": loanType, "version": version } ) # Step11: get pre-processed category data categoryInfo = mongoDataProcessor.getCategoryInfo() input = (npCategoryHeader, npCategoryData) preCategoryHeader, preCategoryData = bpModelDataPrcocessor.getPreProcessedFlattenCategoryData(input, categoryInfo) with open('Data/PreProcessedFlattenCategpryData.csv', 'w') as f: print >> f, ','.join(preCategoryHeader).encode('utf-8') np.savetxt(f, preCategoryData, delimiter=',', fmt='%d') # please note the fmt arg CategoryTagHeader = dataBase.CategoryTagHeader CategoryTagHeader.remove({"loanType":loanType}) for index in range(len(preCategoryHeader)): CategoryTagHeader.insert_one( { "tagName": preCategoryHeader[index], "updateTime": int(time.time()), "version": version, "loanType": loanType, "index" : index
categoryTagIndex.append(header.index(tagName)) else: categoryTagIndex.append(-1) for row in data: rowData = [] for index in categoryTagIndex: if index >= 0: rowData.append(row[index]) else: rowData.append(missingValue) categoryDataTmp.append(rowData) categoryData = np.array(categoryDataTmp, dtype=np.int) categoryInfo = mongoDataProcessor.getCategoryInfo() flattenCategoryData = bpModelDataPrcocessor.getPreProcessedFlattenCategoryData( (categoryHeader, categoryData), categoryInfo)[1] with open('Data/ResultFlattenCategoryData.csv', 'w') as f: f.write(','.join(flattenCategoryHeader) + '\n') np.savetxt(f, flattenCategoryData, delimiter=',', fmt='%d') # Step4: get flatten numerical data numStats = dataBase.NumStats meanStdMap = {} mscollection = numStats.find({}) for item in mscollection: std = item['std'] mean = item['mean'] tagName = item['tagName'] meanStdMap[tagName] = (mean, std) numericalTagIndex = [] for tagName in numericalHeader:
def _saveCsv(path, header, data, fmt):
    """Write *data* to *path* as CSV with a single header row.

    The header is the comma-joined, UTF-8 encoded tag names. ``comments=''``
    stops numpy from prefixing the header row with its default '# ' marker.
    """
    np.savetxt(path, data,
               header=','.join(header).encode('utf-8'),
               delimiter=',',
               fmt=fmt,
               comments='')


# Numerical data only: replace the missing-value marker with the column mean
# (the original note gives -9 as the marker).
npNumericalData = util.replaceMissingValueWithMean(npNumericalData)
_saveCsv('./Data/replaceMissingNumericalData.csv',
         numericalHeader, npNumericalData, '%.4f')

# Drop columns holding a constant value (the original note mentions std as the
# criterion). Category data is reduced first, then numerical data.
categoryHeader, npCategoryData = util.deleteColumnWithConstantValue(
    (categoryHeader, npCategoryData))
numericalHeader, npNumericalData = util.deleteColumnWithConstantValue(
    (numericalHeader, npNumericalData))
_saveCsv('./Data/reducedCategoryData.csv',
         categoryHeader, npCategoryData, '%d')
_saveCsv('./Data/reducedNumericalData.csv',
         numericalHeader, npNumericalData, '%.4f')

# Z-score the reduced numerical data.
zscoreNumericalData = preprocessing.scale(npNumericalData)
_saveCsv('./Data/zscoreNumericalData.csv',
         numericalHeader, zscoreNumericalData, '%.4f')

# Bit-format (flattened) representation of the category data.
preCategoryHeader, preCategoryData = \
    bpModelDataPrcocessor.getPreProcessedFlattenCategoryData(
        (categoryHeader, npCategoryData),
        mongoDataProcessor.getCategoryInfo())
_saveCsv('./Data/preCategoryData.csv',
         preCategoryHeader, preCategoryData, '%d')

# PCA + KMeans over the z-scored numerical data.
PCAResultMap, transResultMap = bpModelDataPrcocessor.getAssociatedMapFromPCA(
    (numericalHeader, zscoreNumericalData))
kmLists = bpModelDataPrcocessor.getKMeansListByCalculation(
    (numericalHeader, zscoreNumericalData),
    (PCAResultMap, transResultMap),
    path='./Figures/')
bpModelDataPrcocessor.saveKMeansListToFile('./KMeansModel/', numericalHeader)

# Flattened numerical data; no tags are dropped.
preNumericalHeader, preNumericalData = \
    bpModelDataPrcocessor.getPreProcessedFlattenNumericalData(
        (numericalHeader, zscoreNumericalData),
        dropTags=[])
np.savetxt('./Data/preNumericalData.csv',preNumericalData,header=','.join(preNumericalHeader).encode('utf-8'),delimiter=',', fmt='%d',comments='') # ANN bPModelTrainer = BPModelTrainer( flattenCategoryHeader=preCategoryHeader, flattenCategoryData=preCategoryData, flattenNumericalHeader=preNumericalHeader, flattenNumericalData=preNumericalData,
else: categoryTagIndex.append(-1) for row in data: rowData = [] for index in categoryTagIndex: if index >= 0: rowData.append(row[index]) else: rowData.append(missingValue) categoryDataTmp.append(rowData) categoryData = np.array(categoryDataTmp, dtype=np.int) categoryInfo = mongoDataProcessor.getCategoryInfo() flattenCategoryData = bpModelDataPrcocessor.getPreProcessedFlattenCategoryData( (categoryHeader, categoryData), categoryInfo )[1] with open("Data/ResultFlattenCategoryData.csv", "w") as f: f.write(",".join(flattenCategoryHeader) + "\n") np.savetxt(f, flattenCategoryData, delimiter=",", fmt="%d") # Step4: get flatten numerical data numStats = dataBase.NumStats meanStdMap = {} mscollection = numStats.find({}) for item in mscollection: std = item["std"] mean = item["mean"] tagName = item["tagName"] meanStdMap[tagName] = (mean, std) numericalTagIndex = []
header=','.join(numericalHeader).encode('utf-8'), delimiter=',', fmt='%.4f', comments='') # z-score format zscoreNumericalData = preprocessing.scale(npNumericalData) np.savetxt('./Data/zscoreNumericalData.csv', zscoreNumericalData, header=','.join(numericalHeader).encode('utf-8'), delimiter=',', fmt='%.4f', comments='') # bitformat for categoryData preCategoryHeader, preCategoryData = bpModelDataPrcocessor.getPreProcessedFlattenCategoryData( (categoryHeader, npCategoryData), mongoDataProcessor.getCategoryInfo()) np.savetxt('./Data/preCategoryData.csv', preCategoryData, header=','.join(preCategoryHeader).encode('utf-8'), delimiter=',', fmt='%d', comments='') # PCA+KMeans for numericalData PCAResultMap, transResultMap = bpModelDataPrcocessor.getAssociatedMapFromPCA( (numericalHeader, zscoreNumericalData)) kmLists = bpModelDataPrcocessor.getKMeansListByCalculation( (numericalHeader, zscoreNumericalData), (PCAResultMap, transResultMap), path='./Figures/') bpModelDataPrcocessor.saveKMeansListToFile('./KMeansModel/', numericalHeader) preNumericalHeader, preNumericalData = bpModelDataPrcocessor.getPreProcessedFlattenNumericalData(