def getDataPackage(data_dir="/home/earls3/Price/AUREAPackage/src/AUREA/data/"):
    """
    Builds a dataPackager from the GDS2545 SOFT file, the BioCarta gene
    network file and the Homo sapiens gene synonym file found in data_dir.

    Samples are assigned to the "Tumor" or "Normal" classification based on
    the text of their column description.  The last normal sample found is
    not classified; it is registered through setUnclassified instead.

    data_dir - directory containing GDS2545.soft.gz,
               c2.biocarta.v2.5.symbols.gmt and Homo_sapiens.gene_info.gz.
               Defaults to the original development location.

    Returns the populated dataPackager.

    Raises IndexError if no column description matches the normal marker,
    since there is then no sample to hold out as unclassified.
    """
    # Local import so this function does not depend on a file-level import.
    import os

    softfile = os.path.join(data_dir, "GDS2545.soft.gz")
    gnfile = os.path.join(data_dir, "c2.biocarta.v2.5.symbols.gmt")
    synfile = os.path.join(data_dir, "Homo_sapiens.gene_info.gz")

    gnf = GMTParser.GMTParser(gnfile)
    sp = SOFTParser.SOFTParser(softfile)

    normal = []
    tumor = []
    # NOTE(review): each entry is presumably (sample id, description);
    # confirm against SOFTParser.column_heading_info.
    for line in sp.column_heading_info[0]:
        description = line[1]
        # Substring test rather than find() > 0, which silently skipped a
        # description that *starts* with the marker (find() returns 0 there).
        if 'normal prostate tissue free' in description:
            normal.append(line[0].strip())
        elif 'tumor' in description:
            tumor.append(line[0].strip())

    dt = DataCleaner.DataTable()
    dt.getSOFTData(sp)

    dp = DataPackager.dataPackager()
    dp.addGeneNetwork(gnf.getAllNetworks())
    dp.addDataTable(dt)
    dp.addSynonyms(synfile)

    dp.createClassification("Tumor")
    dp.createClassification("Normal")
    for samp in tumor:
        dp.addToClassification("Tumor", dt.dt_id, samp)
    # Every normal sample except the last one is used for the class ...
    for samp in normal[:-1]:
        dp.addToClassification("Normal", dt.dt_id, samp)
    # ... and the last one is held out as the unclassified sample.
    dp.setUnclassified(dt.dt_id, normal[-1])
    return dp
def buildData(file1, file2, config):
    """
    Builds a datapackage from two csv files.

    file1, file2 - names of the csv files; the data columns of each file
                   become the members of classification "f1" and "f2"
                   respectively.
    config       - settings object; the "datatable" section supplies the
                   gene collision rule, the bad data value and the names of
                   the gene and probe columns.

    Returns the populated dataPackager.
    """
    def datatable_setting(name):
        # Settings are indexed with [0] to get the value itself.
        return config.getSetting("datatable", name)[0]

    network_file = "c2.biocarta.v2.5.symbols.gmt"
    synonym_file = "Homo_sapiens.gene_info.gz"

    collision_rule = datatable_setting("Gene Collision Rule")
    bad_value = datatable_setting("Bad Data Value")
    gene_col = datatable_setting("Gene Column")
    probe_col = datatable_setting("Probe Column")

    networks = GMTParser.GMTParser(network_file)

    # Both csv files are parsed with the same column names.
    parser_kwargs = {
        "probe_column_name": probe_col,
        "gene_column_name": gene_col,
    }
    parser1 = CSVParser.CSVParser(file1, **parser_kwargs)
    parser2 = CSVParser.CSVParser(file2, **parser_kwargs)

    # One data table per parsed file, configured identically.
    table_args = (probe_col, gene_col, collision_rule, bad_value)
    table1 = DataCleaner.DataTable(*table_args)
    table1.getCSVData(parser1)
    table2 = DataCleaner.DataTable(*table_args)
    table2.getCSVData(parser2)

    package = DataPackager.dataPackager(merge_cache=".")
    package.addGeneNetwork(networks.getAllNetworks())
    package.addSynonyms(synonym_file)

    package.addDataTable(table1)
    package.addDataTable(table2)

    # Each file becomes its own classification holding all of its samples.
    for label, parser, table in (("f1", parser1, table1),
                                 ("f2", parser2, table2)):
        package.createClassification(label)
        for sample in parser.getDataColumnHeadings():
            package.addToClassification(label, table.dt_id, sample)

    return package