def createDataPackage(dataFile, subset1, subset2 ):
    """
    Assemble a dataPackager from one SOFT file and two of its subsets.

    dataFile -- handed straight to SOFTParser.
    subset1, subset2 -- values looked up (with ``in``) in each subset's
        'subset_description' attribute; they are also used as the names
        of the two classifications created on the packager.

    Returns the populated dataPackager.

    Notes:
      * When several subsets match the same value, the last one returned
        by getSubsets() is the one used.
      * When nothing matches, None is passed to getSubsetSamples().
      * AUREA_dir is read from the enclosing module scope.
    """
    from AUREA.packager.DataPackager import dataPackager
    from AUREA.parser.SOFTParser import SOFTParser
    from AUREA.packager.DataCleaner import DataTable
    from AUREA.parser.GMTParser import GMTParser

    # Parse the SOFT file and pick out the subset matching each request.
    soft = SOFTParser(dataFile)
    first_match = None
    second_match = None
    for candidate in soft.getSubsets():
        description = candidate.attributes['subset_description']
        if subset1 in description:
            first_match = candidate
        if subset2 in description:
            second_match = candidate
    first_samples = soft.getSubsetSamples(first_match)
    second_samples = soft.getSubsetSamples(second_match)

    # Load the SOFT data into a table.
    table = DataTable()
    table.getSOFTData(soft)

    # Supporting files used for the gene networks and gene synonyms.
    network_path = AUREA_dir +"/workspace/data/c2.biocarta.v2.5.symbols.gmt"
    synonym_path = AUREA_dir + "/workspace/data/Homo_sapiens.gene_info.gz"

    # Wire everything into the packager.
    packager = dataPackager()
    packager.addSynonyms(synonym_path)
    networks = GMTParser(network_path)
    packager.addGeneNetwork(networks.getAllNetworks())
    packager.addDataTable(table)
    packager.createClassification(subset1)
    packager.createClassification(subset2)

    # Fill each classification with the samples of its subset, in order.
    for label, members in ((subset1, first_samples), (subset2, second_samples)):
        for member in members:
            packager.addToClassification(label, table.dt_id, member)
    return packager
from AUREA.packager.DataCleaner import DataTable
from AUREA.packager.DataPackager import dataPackager

# Script entry point: read the GDS2545 SOFT file and write the samples of
# its normal and tumor subsets to "normal.csv" and "tumor.csv".
if __name__ == "__main__":
    # SOFTParser is called below, but the only import of it visible in this
    # file is local to createDataPackage. Import it here so the script does
    # not depend on the name being bound elsewhere (harmless if it already is).
    from AUREA.parser.SOFTParser import SOFTParser

    path = "/home/earls3/Price/AUREA/workspace/data/"
    f1 = "GDS2545.soft.gz"
    sp = SOFTParser(path + f1)

    # Sample lists for the two subsets of interest; they stay empty when no
    # subset carries the expected description.
    normal_samples = []
    tumor_samples = []
    for subset in sp.getSubsets():
        # Only the first entry of 'subset_description' is compared.
        description = subset.attributes['subset_description'][0]
        if description == 'normal prostate tissue':
            normal_samples = sp.getSubsetSamples(subset)
        if description == 'primary prostate tumor':
            tumor_samples = sp.getSubsetSamples(subset)

    dt = DataTable()
    dt.getSOFTData(sp)
    dp = dataPackager(merge_cache=".")
    dp.addDataTable(dt)

    # First pass: normal samples. The "ignore" classification is created
    # but never populated.
    # NOTE(review): presumably the packager expects two classifications - confirm.
    dp.createClassification("Normal")
    dp.createClassification("ignore")
    for samp in normal_samples:
        dp.addToClassification("Normal", dt.dt_id, samp)
    dp.writeToCSV("normal.csv", key='probe')

    # Second pass: clear the classifications and repeat for tumor samples.
    dp.clearClassification()
    dp.createClassification("Tumor")
    dp.createClassification("ignore")
    for samp in tumor_samples:
        dp.addToClassification("Tumor", dt.dt_id, samp)
    dp.writeToCSV("tumor.csv", key='probe')