def load_soft_series_family(self, filename):
    # Load gene data from a SOFT series-family file.
    # SOFT files are a /sort of/ bastardized csv with data in tab-separated columns,
    # so we use the csv reader to get at it, accounting for most rows being a single
    # field with slightly strange identifiers.
    reader = csv.reader(open(filename, 'rU'), delimiter='\t', dialect='excel')
    soft_data = self.preprocess_soft(reader)

    database = {}
    platform = {}
    samples = {}
    sample_data = {}

    for section, rows in list(soft_data.items()):
        if section.startswith('^DATABASE'):
            database = self.get_soft_metadata(rows)
        elif section.startswith('^PLATFORM'):
            platform = self.get_soft_metadata(rows)
            platform_data = self.get_soft_data(rows, '!platform_table_begin', '!platform_table_end')
        elif section.startswith('^SAMPLE'):
            key, sample_id = section.split(' = ')
            samples[sample_id] = self.get_soft_metadata(rows)
            sample_data[sample_id] = self.get_soft_data(rows, '!sample_table_begin', '!sample_table_end')

    # We now have the entire data series loaded, but in a bit of a messed-up format.
    # Build a dataset object to fit and map the data into.
    xdim = len(platform_data)  # Number of genes, from the platform table
    ydim = len(sample_data)    # Number of samples

    dso = DataSet(size=(xdim, ydim))
    dso.empty(size=(ydim, xdim))

    sample_ids = sorted(samples.keys())      # Sort the samples so everything stays lined up
    gene_ids = sorted(platform_data.keys())  # Sort the keys so everything stays lined up

    dso.labels[0] = sample_ids
    dso.labels[1] = [platform_data[gene_id]['UNIGENE'] for gene_id in gene_ids]
    dso.entities[1] = [self.m.db.get_via_unification('UNIGENE', gene_id) for gene_id in dso.labels[1]]

    for xn, gene_id in enumerate(gene_ids):
        for yn, sample_id in enumerate(sample_ids):
            dso.data[yn, xn] = sample_data[sample_id][gene_id]['VALUE']

    return dso
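
# The loaders above and below lean on self.preprocess_soft, which (per the
# comment in load_soft_dataset) groups rows under their '^'-prefixed section
# headers. A minimal standalone sketch of that grouping step, assuming only
# that each section starts at a '^' row; the real method also tracks file
# position for progress reporting, which is omitted here:
def split_soft_sections(rows):
    """Group SOFT rows into an ordered {section_header: [rows]} mapping."""
    from collections import OrderedDict
    sections = OrderedDict()
    current = None
    for row in rows:
        if row and row[0].startswith('^'):
            current = row[0]  # e.g. '^SAMPLE = GSM123'
            sections[current] = []
        elif current is not None:
            sections[current].append(row)
    return sections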
def load_datafile(self, filename):
    # Load data from a PeakML format file
    dso = DataSet()

    xml = et.parse(filename)

    # Get sample ids, names and class groupings
    sets = xml.iterfind('header/sets/set')
    midclass = {}
    classes = set()
    measurements = []
    masses = {}

    for aset in sets:
        id = aset.find('id').text
        mids = aset.find('measurementids').text
        for mid in self.decode(mids):
            midclass[mid] = id
            measurements.append(mid)
        classes.add(id)

    # We have all the sample data now; parse the intensity and identity info
    peaksets = xml.iterfind('peaks/peak')
    quantities = defaultdict(dict)
    all_identities = []

    for peakset in peaksets:
        # Find metabolite identities
        annotations = peakset.iterfind('annotations/annotation')
        identities = False
        for annotation in annotations:
            if annotation.find('label').text == 'identification':
                identities = annotation.find('value').text.split(', ')
                all_identities.extend(identities)
                break

        if identities:
            # PeakML supports multiple alternative metabolite identities; we don't,
            # so duplicate the intensity and mass across each identity.
            # We have identities; now get intensities for the different samples
            chromatograms = peakset.iterfind('peaks/peak')  # Next level down
            for chromatogram in chromatograms:
                mid = chromatogram.find('measurementid').text
                intensity = float(chromatogram.find('intensity').text)
                mass = float(chromatogram.find('mass').text)

                # Write out to each of the identity tables (buffer until we have the entire list)
                for identity in identities:
                    quantities[mid][identity] = intensity
                    masses[identity] = mass

    # Quantities table built; class table built; now rearrange into the dso
    dso.empty((len(measurements), len(all_identities)))
    dso.labels[0] = measurements
    dso.classes[0] = [midclass[mid] for mid in measurements]
    dso.labels[1] = all_identities
    db_hmdbids = self.m.db.unification['HMDB']
    dso.entities[1] = [db_hmdbids[hmdbid] if hmdbid in db_hmdbids else None for hmdbid in all_identities]
    dso.scales[1] = [float(masses[i]) for i in all_identities]

    for mid, identities in list(quantities.items()):
        for identity, intensity in list(identities.items()):
            r = measurements.index(mid)
            c = all_identities.index(identity)
            dso.data[r, c] = intensity

    dso.name = os.path.basename(filename)
    dso.description = 'Imported PeakML file'
    self.set_name(dso.name)

    return {'output': dso}
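
# The fill loop at the end of load_datafile above resolves each (mid, identity)
# pair with list.index(), an O(n) scan per cell. A sketch of the same mapping
# built with dict indexes instead (illustrative only; not part of the original
# code, and fill_matrix is a hypothetical name):
def fill_matrix(data, measurements, all_identities, quantities):
    """Fill data[r, c] from quantities[mid][identity] using O(1) lookups."""
    row_of = {mid: r for r, mid in enumerate(measurements)}
    col_of = {ident: c for c, ident in enumerate(all_identities)}
    for mid, idents in quantities.items():
        for identity, intensity in idents.items():
            data[row_of[mid], col_of[identity]] = intensity
    return data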
def load_soft_dataset(self, filename):
    # Load gene data from a SOFT dataset file.
    # SOFT files are a /sort of/ bastardized csv with data in tab-separated columns,
    # so we use the csv reader to get at it, accounting for most rows being a single
    # field with slightly strange identifiers.
    f = open(filename, 'rU')
    fsize = os.path.getsize(filename)
    reader = csv.reader(f, delimiter='\t', dialect='excel')
    soft_data = self.preprocess_soft(reader, f=f, fsize=fsize)
    # soft_data now contains lists of sections keyed by their ^ markers

    database = {}
    dataset = {}
    dataset_data = {}
    subsets = {}

    for section, rows in list(soft_data.items()):
        if section.startswith('^DATABASE'):
            database = self.get_soft_metadata(rows)
        elif section.startswith('^DATASET'):
            dataset.update(self.get_soft_metadata(rows))  # update, as there can be >1 DATASET entry
            data = self.get_soft_data(rows, '!dataset_table_begin', '!dataset_table_end')
            dataset_data = data
        elif section.startswith('^SUBSET'):
            key, subset_id = section.split(' = ')
            subsets[subset_id] = self.get_soft_metadata(rows)
            # Turn the comma-separated sample ids into a list
            subsets[subset_id]['subset_sample_id'] = subsets[subset_id]['subset_sample_id'].split(',')

    # We now have the entire dataset loaded, but in a bit of a messed-up format.
    # Build a dataset object to fit and map the data into.
    sample_ids = []
    for k, subset in list(subsets.items()):
        sample_ids.extend(subset['subset_sample_id'])
    sample_ids = sorted(sample_ids)  # Sort the samples so everything stays lined up

    class_lookup = {}
    for class_id, s in list(subsets.items()):
        for s_id in s['subset_sample_id']:
            class_lookup[s_id] = "%s (%s)" % (s['subset_description'] if 'subset_description' in s else '', class_id)

    xdim = len(dataset_data)  # Number of genes, from the dataset table
    ydim = len(sample_ids)    # Number of samples

    dso = DataSet(size=(xdim, ydim))
    dso.empty(size=(ydim, xdim))

    gene_ids = sorted(dataset_data.keys())  # Sort the keys so everything stays lined up

    dso.labels[0] = sample_ids
    dso.classes[0] = [class_lookup[s_id] for s_id in sample_ids]
    dso.labels[1] = [dataset_data[gene_id]['IDENTIFIER'] for gene_id in gene_ids]
    dso.entities[1] = [self.m.db.get_via_synonym(gene_id) for gene_id in dso.labels[1]]

    for xn, gene_id in enumerate(gene_ids):
        for yn, sample_id in enumerate(sample_ids):
            dso.data[yn, xn] = dataset_data[gene_id][sample_id]

    return dso
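
# get_soft_data is called above as get_soft_data(rows, begin_marker, end_marker)
# and its result is indexed as table[row_id][column_name], so it presumably
# returns the delimited table as a dict-of-dicts. A standalone sketch of that
# shape (an assumption inferred from usage, not the actual implementation):
def parse_soft_table(rows, begin_marker, end_marker):
    """Return {row_id: {column_name: value}} for the rows between the markers."""
    table = {}
    header = None
    in_table = False
    for row in rows:
        if row and row[0] == begin_marker:
            in_table = True
        elif row and row[0] == end_marker:
            break
        elif in_table:
            if header is None:
                header = row  # the first in-table row holds the column names
            else:
                table[row[0]] = dict(zip(header, row))
    return table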
def load_csv_C(self, filename):
    # Load from csv with experiments in COLUMNS, metabolites in ROWS
    f = open(filename, 'rU')
    fsize = os.path.getsize(filename)
    reader = csv.reader(f, delimiter=',', dialect='excel')

    hrow = next(reader)  # Top row: sample ids
    samples = hrow[1:]

    hrow = next(reader)  # Second row: class labels
    classesa = hrow[1:]
    classes = [c for c in classesa if c != '.']  # '.' marks an excluded column

    metabolites = []
    data = []

    for n, row in enumerate(reader):
        metabolites.append(row[0])
        for cn, c in enumerate(row[1:]):
            if classesa[cn] != '.':  # Skip excluded columns
                try:
                    data.append(float(c))
                except ValueError:
                    data.append(0)

        if n % 100 == 0:
            self.progress.emit(float(f.tell()) / fsize)

    data = np.asarray(data)
    data = np.reshape(data, (n + 1, len(classes))).T

    xdim = len(metabolites)
    ydim = len(classes)

    # Build dataset object
    dso = DataSet(size=(xdim, ydim))
    dso.empty(size=(ydim, xdim))

    # Metabolite labels that parse as numbers are treated as scale values
    scales = []
    mlabels = []
    for m in metabolites:
        try:
            scales.append(float(m))
            mlabels.append(None)
        except ValueError:
            scales.append(None)
            mlabels.append(m)

    dso.scales[0] = [None] * len(samples)
    dso.labels[0] = samples
    dso.classes[0] = classes
    dso.entities[0] = [None] * len(samples)

    dso.scales[1] = scales
    dso.labels[1] = mlabels
    dso.classes[1] = [None] * len(scales)
    dso.entities[1] = [None] * len(scales)

    dso.data = data

    return dso
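
# The layout load_csv_C expects, inferred from the parsing code above: row 0
# holds sample ids, row 1 holds class labels ('.' excludes a column), and each
# following row is one metabolite. A hypothetical fixture (the filename and
# values are made up for illustration):
import csv
with open('example_columns.csv', 'w') as fh:
    w = csv.writer(fh)
    w.writerow(['', 'S1', 'S2', 'S3'])            # sample ids
    w.writerow(['', 'A', 'B', '.'])               # classes; S3 excluded
    w.writerow(['glucose', '1.2', '1.3', '0.9'])  # one metabolite per row
    w.writerow(['lactate', '0.4', '0.5', '0.6'])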
def load_csv_R(self, filename):
    # Load from csv with experiments in ROWS, metabolites in COLUMNS
    f = open(filename, 'rU')
    fsize = os.path.getsize(filename)
    reader = csv.reader(f, delimiter=',', dialect='excel')

    hrow = next(reader)  # Top row: metabolite names from column 2 onward
    metabolites = hrow[2:]

    ydim = 0
    xdim = len(metabolites)

    samples = []
    classes = []
    raw_data = []

    for n, row in enumerate(reader):
        ydim += 1
        if row[1] != '.':  # row[1] = class; '.' marks an excluded row
            samples.append(row[0])
            classes.append(row[1])
            data_row = []
            for c in row[2:]:
                try:
                    c = float(c)
                except ValueError:
                    c = 0
                data_row.append(c)
            raw_data.append(data_row)

        if n % 100 == 0:
            self.progress.emit(float(f.tell()) / fsize)

    # Build dataset object
    dso = DataSet(size=(xdim, ydim))
    dso.empty(size=(ydim, xdim))

    # Metabolite labels that parse as numbers are treated as scale values
    scales = []
    mlabels = []
    for m in metabolites:
        try:
            scales.append(float(m))
            mlabels.append(None)
        except ValueError:
            scales.append(None)
            mlabels.append(m)

    dso.scales[0] = [None] * len(samples)
    dso.labels[0] = samples
    dso.classes[0] = classes
    dso.entities[0] = [None] * len(samples)

    dso.scales[1] = scales
    dso.labels[1] = mlabels
    dso.entities[1] = [None] * len(scales)
    dso.classes[1] = [None] * len(scales)

    dso.data = np.array(raw_data)

    return dso
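
# The transposed layout load_csv_R expects, again inferred from the parser:
# metabolite names run across the header from column 2, and each following row
# is sample id, class ('.' excludes the row), then one value per metabolite.
# A hypothetical fixture mirroring the one above:
import csv
with open('example_rows.csv', 'w') as fh:
    w = csv.writer(fh)
    w.writerow(['Sample', 'Class', 'glucose', 'lactate'])
    w.writerow(['S1', 'A', '1.2', '0.4'])
    w.writerow(['S2', 'B', '1.3', '0.5'])
    w.writerow(['S3', '.', '0.9', '0.6'])  # excluded row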
def load_datafile(self, filename):
    # Load data from a PeakML format file
    dso = DataSet()

    xml = et.parse(filename)

    # Get sample ids, names and class groupings
    sets = xml.iterfind('header/sets/set')
    midclass = {}
    classes = set()
    measurements = []
    masses = {}

    for aset in sets:
        id = aset.find('id').text
        mids = aset.find('measurementids').text
        for mid in self.decode(mids):
            midclass[mid] = id
            measurements.append(mid)
        classes.add(id)

    # We have all the sample data now; parse the intensity and identity info
    peaksets = xml.iterfind('peaks/peak')
    quantities = defaultdict(dict)
    all_identities = []

    for peakset in peaksets:
        # Find metabolite identities
        annotations = peakset.iterfind('annotations/annotation')
        identities = False
        for annotation in annotations:
            if annotation.find('label').text == 'identification':
                identities = annotation.find('value').text.split(', ')
                all_identities.extend(identities)
                break

        if identities:
            # PeakML supports multiple alternative metabolite identities; we don't,
            # so duplicate the intensity and mass across each identity.
            # We have identities; now get intensities for the different samples
            chromatograms = peakset.iterfind('peaks/peak')  # Next level down
            for chromatogram in chromatograms:
                mid = chromatogram.find('measurementid').text
                intensity = float(chromatogram.find('intensity').text)
                mass = float(chromatogram.find('mass').text)

                # Write out to each of the identity tables (buffer until we have the entire list)
                for identity in identities:
                    quantities[mid][identity] = intensity
                    masses[identity] = mass

    # Quantities table built; class table built; now rearrange into the dso
    dso.empty((len(measurements), len(all_identities)))
    dso.labels[0] = measurements
    dso.classes[0] = [midclass[mid] for mid in measurements]
    dso.labels[1] = all_identities
    db_hmdbids = self.m.db.unification['HMDB']
    dso.entities[1] = [db_hmdbids[hmdbid] if hmdbid in db_hmdbids else None for hmdbid in all_identities]
    dso.scales[1] = [float(masses[i]) for i in all_identities]

    for mid, identities in list(quantities.items()):
        for identity, intensity in list(identities.items()):
            r = measurements.index(mid)
            c = all_identities.index(identity)
            dso.data[r, c] = intensity

    dso.name = os.path.basename(filename)
    dso.description = 'Imported PeakML file'
    self.change_name.emit(dso.name)

    return {'output': dso}
def load_csv_C(self, filename):
    # Load from csv with experiments in COLUMNS, metabolites in ROWS
    f = open(filename, 'rU')
    fsize = os.path.getsize(filename)
    reader = csv.reader(f, delimiter=',', dialect='excel')

    hrow = next(reader)  # Top row: sample ids
    samples = hrow[1:]

    hrow = next(reader)  # Second row: class labels
    classesa = hrow[1:]
    classes = [c for c in classesa if c != '.']  # '.' marks an excluded column

    metabolites = []
    data = []

    for n, row in enumerate(reader):
        metabolites.append(row[0])
        for cn, c in enumerate(row[1:]):
            if classesa[cn] != '.':  # Skip excluded columns
                try:
                    data.append(float(c))
                except ValueError:
                    data.append(0)

        if n % 100 == 0:
            self.progress.emit(float(f.tell()) / fsize)

    data = np.asarray(data)
    data = np.reshape(data, (n + 1, len(classes))).T

    xdim = len(metabolites)
    ydim = len(classes)

    # Build dataset object
    dso = DataSet(size=(xdim, ydim))
    dso.empty(size=(ydim, xdim))

    # Metabolite labels that parse as numbers are treated as scale values
    scales = []
    mlabels = []
    for m in metabolites:
        try:
            scales.append(float(m))
            mlabels.append(None)
        except ValueError:
            scales.append(None)
            mlabels.append(m)

    dso.scales[0] = [None] * len(samples)
    dso.labels[0] = samples
    dso.classes[0] = classes
    dso.entities[0] = [None] * len(samples)

    dso.scales[1] = scales
    dso.labels[1] = mlabels
    dso.classes[1] = [None] * len(scales)
    dso.entities[1] = [None] * len(scales)

    dso.data = data

    return dso
def load_csv_R(self, filename):
    # Load from csv with experiments in ROWS, metabolites in COLUMNS
    f = open(filename, 'rU')
    fsize = os.path.getsize(filename)
    reader = csv.reader(f, delimiter=',', dialect='excel')

    hrow = next(reader)  # Top row: metabolite names from column 2 onward
    metabolites = hrow[2:]

    ydim = 0
    xdim = len(metabolites)

    samples = []
    classes = []
    raw_data = []

    for n, row in enumerate(reader):
        ydim += 1
        if row[1] != '.':  # row[1] = class; '.' marks an excluded row
            samples.append(row[0])
            classes.append(row[1])
            data_row = []
            for c in row[2:]:
                try:
                    c = float(c)
                except ValueError:
                    c = 0
                data_row.append(c)
            raw_data.append(data_row)

        if n % 100 == 0:
            self.progress.emit(float(f.tell()) / fsize)

    # Build dataset object
    dso = DataSet(size=(xdim, ydim))
    dso.empty(size=(ydim, xdim))

    # Metabolite labels that parse as numbers are treated as scale values
    scales = []
    mlabels = []
    for m in metabolites:
        try:
            scales.append(float(m))
            mlabels.append(None)
        except ValueError:
            scales.append(None)
            mlabels.append(m)

    dso.scales[0] = [None] * len(samples)
    dso.labels[0] = samples
    dso.classes[0] = classes
    dso.entities[0] = [None] * len(samples)

    dso.scales[1] = scales
    dso.labels[1] = mlabels
    dso.entities[1] = [None] * len(scales)
    dso.classes[1] = [None] * len(scales)

    dso.data = np.array(raw_data)

    return dso