def _simulate_missing(pattern, rate, data):
    """Call the missing-data generator that matches ``pattern``.

    Args:
        pattern: One of 'normal', 'taxa', 'chara' or 'block'.
        rate: Missing rate forwarded to the generator as ``rate``.
        data: Dataset forwarded to the generator as ``data``.

    Returns:
        Whatever the selected ``gene_missingdata*`` function returns.

    Raises:
        Exception: If ``pattern`` is not one of the four known names.
    """
    if pattern == 'normal':
        return gene_missingdata(rate=rate, data=data)
    if pattern == 'taxa':
        return gene_missingdata_taxa_bias(rate=rate, data=data)
    if pattern == 'chara':
        return gene_missingdata_chara_bias(rate=rate, data=data)
    if pattern == 'block':
        return gene_missingdata_block_bias(rate=rate, data=data)
    raise Exception("缺失模式错误,请在'normal','taxa','chara','block'中选择对应模式")


def mainWork(path,savePath):
    """Run the MICE + TAI experiment on every xlsx/csv file under ``path``.

    For each accepted file, and for each missing pattern and missing rate, a
    dataset with missing entries is generated, ``MICE`` is run on it, and
    ``TAI`` is then run on top of the MICE output.  Every call threads the
    same ``result`` dict through, and that dict is written with ``saveJson``
    into ``savePath`` under a timestamped file name.

    Args:
        path: Directory that is listed for input files.
        savePath: Output directory; created when it does not exist.

    Raises:
        Exception: If an unknown missing pattern is configured.
    """
    # Experiment grid.  The single-element lists are kept as lists so more
    # values can be added without touching the loops.
    miss_patterns = ['normal']
    miss_rates = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5]
    first_stage_methods = ['mice']
    losses = ['MSELoss']
    # Other variants previously tried here:
    # 'Autoencoder', 'ResAutoencoder', 'StockedAutoencoder'.
    auto_methods = ['StockedResAutoencoder']

    progress = tqdm.tqdm(os.listdir(path), desc='dirs')
    for fname in progress:
        progress.set_description("Processing %s" % fname)
        if not fname.endswith(('xlsx', 'csv')):
            continue

        originData = readAllTypeFile(os.path.join(path, fname))
        for missPattern in miss_patterns:
            result = {}
            for missRate in miss_rates:
                missData = _simulate_missing(missPattern, missRate, originData)
                result, mice_imputed = MICE(result, originData, missData,
                                            missRate, missPattern)
                for firstImputedMethod in first_stage_methods:
                    if firstImputedMethod == 'mice':
                        firstImputedData = mice_imputed
                    for loss in losses:
                        for autoMethod in auto_methods:
                            started = time.time()
                            result, _ = TAI(
                                result=result,
                                firstImputedMethod=firstImputedMethod,
                                firstImputedData=firstImputedData,
                                loss=loss,
                                autoMethod=autoMethod,
                                originData=originData,
                                missData=missData,
                                missRate=missRate,
                                missPattern=missPattern,
                            )
                            elapsed = time.time() - started
                            logger.info("{}-{}-{}训练耗时:{}".format(
                                firstImputedMethod, loss, autoMethod, elapsed))

            # NOTE(review): the original indentation was lost; writing the
            # accumulated result once per missing pattern (after all rates)
            # is the nesting assumed here.
            if not os.path.exists(savePath):
                os.makedirs(savePath)
            stamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
            out_name = "{}_{}_{}_{}.json".format("allMethod", missPattern,
                                                 fname, stamp)
            saveJson(result, os.path.join(savePath, out_name))
# Create one result container per (first-stage imputer, loss, autoencoder
# variant) combination.  Each container is stored as a global under the
# generated name and that name is registered in both method-name lists.
for first_imputed_method in ['ii', 'mice']:
    for loss in ['MSELoss']:
        for method in [
                'Autoencoder', 'ResAutoencoder', 'StockedAutoencoder',
                'StockedResAutoencoder'
        ]:
            varname = "{}_{}_{}".format(first_imputed_method, loss, method)
            # Four independent empty lists per method.
            globals()[varname] = [[] for _ in range(4)]
            methed_names_half.append(varname)
            methed_names_all.append(varname)
# NOTE(review): the original indentation was lost; the rate loop below is
# assumed to be a sibling of the setup loops above, not nested in them.
# for i in [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
for i in [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]:
    # Pick the missing-data generator that matches the configured pattern;
    # missPattern and originData come from code outside this fragment.
    if missPattern == 'normal':
        missData = gene_missingdata(rate=i, data=originData)
    elif missPattern == 'taxa':
        missData = gene_missingdata_taxa_bias(rate=i, data=originData)
    elif missPattern == 'chara':
        missData = gene_missingdata_chara_bias(rate=i, data=originData)
    elif missPattern == 'block':
        missData = gene_missingdata_block_bias(rate=i, data=originData)
    else:
        raise Exception(
            "缺失模式错误,请在'normal','taxa','chara','block'中选择对应模式")
    # Distinct values occurring in missData, with NaN entries dropped.
    mark = [
        temp[0] for temp in pd.DataFrame(np.unique(missData)).dropna(
            axis=0).values
    ]
def mainWork(path, savePath):
    """Benchmark all imputation methods on every file found under ``path``.

    Each directory entry is loaded with ``readAllTypeFile``.  For every
    missing pattern and missing rate a dataset with missing entries is
    generated, the baseline imputers (Random, Medain, KNN, EM, II, GAIN, MIDA,
    MICE) are run in that order, and ``TAI`` is then run once on top of each
    of the KNN, II and MICE outputs.  All calls thread the same ``result``
    dict through, and that dict is written with ``saveJson`` as one
    timestamped JSON file per (file, missing pattern).

    Args:
        path: Directory whose entries are all passed to ``readAllTypeFile``.
        savePath: Directory that receives the JSON files.  It is created if
            it does not exist, so a missing directory no longer makes the run
            fail at the very end, after all training has finished.

    Raises:
        Exception: If a missing pattern other than 'normal', 'taxa', 'chara'
            or 'block' is configured.
    """
    for file in os.listdir(path):
        originData = readAllTypeFile(os.path.join(path, file))
        # Other supported patterns: 'block', 'taxa', 'chara'.
        for missPattern in ['normal']:
            result = {}
            # Rates up to 0.9 were used in earlier runs.
            for missRate in [0.05, 0.1, 0.2, 0.3, 0.4, 0.5]:
                if missPattern == 'normal':
                    missData = gene_missingdata(rate=missRate,
                                                data=originData)
                elif missPattern == 'taxa':
                    missData = gene_missingdata_taxa_bias(rate=missRate,
                                                          data=originData)
                elif missPattern == 'chara':
                    missData = gene_missingdata_chara_bias(rate=missRate,
                                                           data=originData)
                elif missPattern == 'block':
                    missData = gene_missingdata_block_bias(rate=missRate,
                                                           data=originData)
                else:
                    raise Exception(
                        "缺失模式错误,请在'normal','taxa','chara','block'中选择对应模式")

                # Baseline imputers.  Only the KNN, II and MICE outputs are
                # needed later as TAI starting points; the rest are dropped.
                result, _ = Random(result, originData, missData, missRate,
                                   missPattern, 'disperse')
                result, _ = Medain(result, originData, missData, missRate,
                                   missPattern, 'disperse')
                result, KNNImputedData = KNN(result, originData, missData,
                                             missRate, missPattern,
                                             'disperse')
                result, _ = EM(result, originData, missData, missRate,
                               missPattern, 'disperse')
                result, IIImputedData = II(result, originData, missData,
                                           missRate, missPattern, 'disperse')
                result, _ = GAIN(result, originData, missData, missRate,
                                 missPattern, 'disperse')
                result, _ = MIDA(result, originData, missData, missRate,
                                 missPattern, 'disperse')
                result, MICEImputedData = MICE(result, originData, missData,
                                               missRate, missPattern,
                                               'disperse')

                first_stage_outputs = {
                    'knn': KNNImputedData,
                    'ii': IIImputedData,
                    'mice': MICEImputedData,
                }
                for firstImputedMethod in ['knn', 'ii', 'mice']:
                    firstImputedData = first_stage_outputs[firstImputedMethod]
                    for loss in ['MSELoss']:
                        # Other variants: 'ResAutoencoder',
                        # 'StockedAutoencoder', 'StockedResAutoencoder'.
                        for autoMethod in ['Autoencoder']:
                            start = time.time()
                            # NOTE(review): TAI's return value is bound to
                            # `result` as a whole.  If TAI returns a
                            # (result, data) pair like the imputers above,
                            # it must be unpacked — confirm against TAI.
                            result = TAI(
                                result=result,
                                firstImputedMethod=firstImputedMethod,
                                # A copy is passed so each TAI run gets its
                                # own array (presumably TAI modifies its
                                # input in place — confirm).
                                firstImputedData=firstImputedData.copy(),
                                loss=loss,
                                autoMethod=autoMethod,
                                originData=originData,
                                missData=missData,
                                missRate=missRate,
                                missPattern=missPattern)
                            logger.info("改后{}-{}-{}训练耗时:{}".format(
                                firstImputedMethod, loss, autoMethod,
                                time.time() - start))

            # NOTE(review): the original indentation was lost; saving once
            # per missing pattern (after all rates) is the nesting assumed.
            # Ensure the output directory exists before writing into it.
            os.makedirs(savePath, exist_ok=True)
            saveJson(
                result,
                os.path.join(
                    savePath, "{}_{}_{}_{}.json".format(
                        "allmethod", missPattern, file,
                        datetime.datetime.now().strftime('%Y%m%d-%H%M%S'))))
#simDataPath=r'C:\Users\pro\Desktop\实验相关文档\缺失插补建树全流程\03无缺失随机缺失模拟数据' #生成模拟数据集 # for file in os.listdir(originDataPath): # data,missRow,speciesName,begin,end=readNex(os.path.join(originDataPath,file)) # noMissingData = impyute.imputation.cs.random(data) # saveData(noMissingDataPath,file,speciesName,noMissingData,begin,end) imputedDataPath = r'C:\Users\pro\Desktop\实验相关文档\缺失插补建树全流程\04缺失插补结果' for file in tqdm.tqdm(os.listdir(noMissingDataPath)): originData, missRow, speciesName, begin, end = readNex( os.path.join(noMissingDataPath, file)) for missPattern in ['normal']: result = {} for missRate in [0.05, 0.1, 0.2, 0.3, 0.4, 0.5]: if missPattern == 'normal': missData = gene_missingdata(rate=missRate, data=originData) elif missPattern == 'taxa': missData = gene_missingdata_taxa_bias(rate=missRate, data=originData) elif missPattern == 'chara': missData = gene_missingdata_chara_bias(rate=missRate, data=originData) elif missPattern == 'block': missData = gene_missingdata_block_bias(rate=missRate, data=originData) else: raise Exception( "缺失模式错误,请在'normal','taxa','chara','block'中选择对应模式") # saveData(simDataPath, "{}_{}".format(str(missRate),file), speciesName, missData, begin, end) missData = lableEncoder(missData)
# fileSavePath=r'C:\Users\ASUS\Desktop\usefulDataSimple\fixed' filePath = r'G:\labWork\imputed_experiment_data' modelSavePath = r'G:\labWork\imputed_experiment_data\model' fileSavePath = r'G:\labWork\imputed_experiment_data\fix' for file in os.listdir(filePath): if file.endswith('tnt'): file = file[:-4] for i in [0.1, 0.2, 0.4, 0.5]: try: # file='02Bennett94pterosaurs' # file='Liu2011' # originData,miss_mask,speciesName=readNex(r'C:\Users\pro\Desktop\all_nex_data\{}.nex'.format(file)) originData, miss_mask, speciesName, begin, end = readNex( os.path.join(filePath, '{}.tnt'.format(file))) missData, miss_mask = gene_missingdata(rate=i, data=originData) try: min_max_scaler = preprocessing.MinMaxScaler() data = min_max_scaler.fit_transform(missData) miss_location = get_miss_location(data[miss_mask]) modelName = file + str(i) inp = interpolation(modelName=modelName, completeData=np.delete(data, miss_mask, axis=0)) if not os.path.exists( os.path.join(modelSavePath, '{}.pkl'.format(modelName))): inp.fit( os.path.join(modelSavePath,
# Normalize the data and strip the labels.
file = r'public_data/1_Iris.xlsx'
fileSavePath = r'G:\labWork\imputed_experiment_data\fix'
# file='AhyongOM04crabs'
modelSavePath = r'G:\labWork\imputed_experiment_data\model'
logger.info("**********************{}********************".format(file))
data = pd.read_excel(file, sheet_name="dataset")
dt = np.array(data.values)
data = dt.astype('float')
# Take the label row BEFORE removing it from the matrix.  The previous order
# (slice first, then data[-1]) made `target` a row that was still part of the
# feature data instead of the row that was stripped.
# NOTE(review): both slices act on axis 0 (rows).  If the labels are stored
# as the last COLUMN of the sheet, these should be data[:, -1] and
# data[:, :-1] instead — confirm against the xlsx layout.
target = data[-1]
data = data[:-1]
for i in [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
    missData = gene_missingdata(rate=i, data=data)
    mask = get_mask(missData)
    miss_location = get_miss_location(missData)
    # Feature scaling.  Despite the variable name, the scaler in use is
    # StandardScaler; MinMaxScaler was the earlier choice.
    min_max_scaler = preprocessing.StandardScaler()
    mm_missData = min_max_scaler.fit_transform(missData)
    # NOTE(review): modelName embeds the relative path held in `file`
    # (including 'public_data/'), so the .pkl path points into a
    # sub-directory of modelSavePath — confirm that is intended.
    modelName = file + str(i)
    modelPath = os.path.join(modelSavePath, '{}.pkl'.format(modelName))
    inp = interpolation_mask(modelName=modelName,
                             completeData=random_inpute(mm_missData))
    # Fit only when no saved model exists for this file/rate yet.
    if not os.path.exists(modelPath):
        inp.fit(modelPath, mask)