# Shared imports assumed by the plugin methods below. Hedged: in the original project each
# method lives in its own plugin module with its own import block; DataSet, utils and
# make_label_for_entry come from the host application's own modules and are not shown here.
import csv
import os
from collections import defaultdict
from xml.etree import ElementTree as et

import numpy as np
import scipy as sp
import scipy.stats  # makes sp.stats.linregress available below
import nmrglue as ng
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression


def generate(self, input=None):
    correlations = {}
    for n, v in enumerate(self.config.get('variables')):
        a, b = v
        x = input.data[:, a]
        y = input.data[:, b]

        fit = np.polyfit(x, y, 1)

        dso = DataSet(size=(len(x), 2))
        dso.data[:, 0] = x
        dso.data[:, 1] = y
        dso.labels[1][0] = make_label_for_entry([input.scales[1][a], input.labels[1][a], input.entities[1][a]])
        dso.labels[1][1] = make_label_for_entry([input.scales[1][b], input.labels[1][b], input.entities[1][b]])

        slope, intercept, r_value, p_value, std_err = sp.stats.linregress(x, y)

        correlations[str(n + 1)] = {
            'dso': dso,
            'fit': fit,
            'label': 'r²=%0.2f, p=%0.2f' % (r_value ** 2, p_value),
        }

    return {'correlations': correlations}
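# --- Hedged usage sketch (not part of the original plugin) ---------------------------
# Illustrates the pairwise-correlation step above with plain numpy/scipy: np.polyfit
# gives the straight-line fit and scipy.stats.linregress the r/p statistics used for
# the label. The array shapes and values here are made up.
def _correlation_sketch():
    import numpy as np
    import scipy.stats

    rng = np.random.default_rng(0)
    x = rng.normal(size=50)
    y = 2.0 * x + rng.normal(scale=0.5, size=50)  # roughly linear relationship

    fit = np.polyfit(x, y, 1)  # slope, intercept of the least-squares line
    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(x, y)
    label = 'r²=%0.2f, p=%0.2f' % (r_value ** 2, p_value)
    return fit, label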
def normalise(self, dsi):
    # Copy the input dataset and apply the configured normalisation algorithm
    # (looked up by name in self.algorithms) to the data matrix
    dso = DataSet(size=dsi.shape)
    dso.import_data(dsi)
    dso.data = self.algorithms[self.config.get('algorithm')](dso.data)
    return dso
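# --- Hedged sketch of the self.algorithms dispatch (assumed, not from the source) -----
# normalise() looks the configured algorithm name up in a dict of callables and applies
# it to the whole data matrix. The dict below is a minimal stand-in; the real plugin
# defines its own algorithms and names, which are not shown here.
_example_normalisation_algorithms = {
    # Scale each spectrum (row) to unit total intensity
    'total_sum': lambda data: data / data.sum(axis=1, keepdims=True),
    # Centre each variable (column) on its mean
    'mean_centre': lambda data: data - data.mean(axis=0, keepdims=True),
}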
def generate(self, input=None):
    dso = DataSet(size=input.shape)
    dso.import_data(input)

    threshold = self.config.get('peak_threshold')
    algorithm = self.algorithms[self.config.get('algorithm')]
    msep = (self.config.get('peak_separation'), )

    # Take input dataset and flatten in the first dimension (average the spectra)
    data_avg = np.mean(input.data, axis=0)

    # Pick peaks and return locations;
    # nmrglue.analysis.peakpick.pick(data, pthres, nthres=None, msep=None, algorithm='connected',
    #     est_params=True, lineshapes=None, edge=None, diag=False, c_struc=None, c_ndil=0,
    #     cluster=True, table=True, axis_names=['A', 'Z', 'Y', 'X'])
    locations, scales, amps = ng.analysis.peakpick.pick(
        data_avg, threshold, msep=msep, algorithm=algorithm,
        est_params=True, cluster=False, table=False)

    n_locations = len(locations)
    new_shape = list(input.shape)
    new_shape[1] = n_locations

    # Adjust the scales (so they aren't lost in the crop)
    scales = [dso.scales[1][l[0]] for l in locations]
    dso.labels[1] = [str(l) for l in scales]
    dso.scales[1] = scales

    dso.crop(new_shape)

    # Copy the data at each picked location into the cropped dataset
    for n, l in enumerate(locations):
        dso.data[:, n] = input.data[:, l[0]]
        # FIXME: optionally use the line widths and take the max within each region for
        # each spectrum (to allow for peak shift), then filter the original data with
        # those locations

    return {'output': dso}
def load_metabolights(self, filename, id_col=0, name_col=4, data_col=18):
    # Load from csv with experiments in COLUMNS, metabolites in ROWS
    print("Loading Metabolights...")
    # sample            1               2               3               4
    # class             ADG10003u_007   ADG10003u_008   ADG10003u_009   ADG10003u_010   ADG19007u_192
    # 2-oxoisovalerate  0.3841          0.44603         0.45971         0.40812
    reader = csv.reader(open(filename, 'rU'), delimiter=',', dialect='excel')

    # Sample identities from the top row (sample labels)
    hrow = next(reader)
    sample_ids = hrow[1:]

    # Sample classes are embedded in the header columns; crop off everything after 'u_'
    classes = [c for c in hrow if 'u_' in c]
    data_starts_at = hrow.index(classes[0])
    metabolite_names_at = hrow.index('metabolite_identification')
    classes = [c.split('u_')[0] for c in classes]

    metabolites = []
    metabolite_data = []
    # Read in metabolite data; n.b. can have >1 entry per metabolite so need to allow for this
    for row in reader:
        if row[0] != '':  # Skip empty rows
            metabolites.append(row[metabolite_names_at])
            metabolite_data.append(row[data_starts_at:])

    ydim = len(classes)
    xdim = len(metabolites)

    dso = DataSet(size=(ydim, xdim))
    dso.labels[0] = sample_ids
    dso.classes[0] = classes
    dso.labels[1] = metabolites

    for n, md in enumerate(metabolite_data):
        dso.data[:, n] = np.array(md, dtype=float)

    return dso
def generate(self, input=None):
    data = input.data

    pca = PCA(n_components=self.config.get('number_of_components'))
    pca.fit(data.T)  # Transpose it, as vars need to be along the top
    weights = pca.transform(data.T)  # Get weights

    # Label up the top 50 (the values are retained; just for clarity)
    wmx = np.amax(np.absolute(weights), axis=1)
    dso_z = list(zip(input.scales[1], input.entities[1], input.labels[1]))
    dso_z = sorted(zip(dso_z, wmx), key=lambda x: x[1])[-50:]  # Top 50
    dso_z = [x for x, wmx in dso_z]

    # Build scores into a dso: no_of_samples x no_of_principal_components
    scored = DataSet(size=(len(pca.components_[0]), len(pca.components_)))
    scored.labels[0] = input.labels[0]
    scored.classes[0] = input.classes[0]

    for n, s in enumerate(pca.components_):
        scored.data[:, n] = s
        scored.labels[1][n] = 'Principal Component %d (%0.2f%%)' % (n + 1, pca.explained_variance_ratio_[n] * 100.)

    dso_pc = {}
    for n in range(0, weights.shape[1]):
        pcd = DataSet(size=(1, input.shape[1]))
        pcd.entities[1] = input.entities[1]
        pcd.labels[1] = input.labels[1]
        pcd.scales[1] = input.scales[1]
        pcd.data = weights[:, n:n + 1].T
        dso_pc['pc%s' % (n + 1)] = pcd

    return dict(list({
        'dso': input,
        'pca': pca,
        'scores': scored,
        #'weights': weights,
        'wmx': wmx,
        'dso_z': dso_z,
    }.items()) + list(dso_pc.items()))
def load_soft_series_family(self, filename):
    # Load from SOFT data file for genes
    # SOFT files are a /sort of/ bastardized csv with data in tab-separated columns
    # So, we use the csv reader to get that, accounting for most stuff being single field with
    # slightly strange identifiers
    reader = csv.reader(open(filename, 'rU'), delimiter='\t', dialect='excel')

    soft_data = self.preprocess_soft(reader)

    database = {}
    platform = {}
    samples = {}
    sample_data = {}

    for section, rows in list(soft_data.items()):
        if section.startswith('^DATABASE'):
            database = self.get_soft_metadata(rows)
        elif section.startswith('^PLATFORM'):
            platform = self.get_soft_metadata(rows)
            platform_data = self.get_soft_data(rows, '!platform_table_begin', '!platform_table_end')
        elif section.startswith('^SAMPLE'):
            key, sample_id = section.split(' = ')
            samples[sample_id] = self.get_soft_metadata(rows)
            sample_data[sample_id] = self.get_soft_data(rows, '!sample_table_begin', '!sample_table_end')

    # We now have the entire data series loaded, but in a bit of a messed up format
    # Build a dataset object to fit and map the data in
    xdim = len(platform_data)  # Use the platform table to access the gene list
    ydim = len(sample_data)

    # Build dataset object
    dso = DataSet(size=(xdim, ydim))
    dso.empty(size=(ydim, xdim))

    sample_ids = sorted(samples.keys())  # Get the samples sorted so we keep everything lined up
    gene_ids = sorted(platform_data.keys())  # Get the keys sorted so we keep everything lined up

    dso.labels[0] = sample_ids
    dso.labels[1] = [platform_data[gene_id]['UNIGENE'] for gene_id in gene_ids]
    dso.entities[1] = [self.m.db.get_via_unification('UNIGENE', gene_id) for gene_id in dso.labels[1]]

    for xn, gene_id in enumerate(gene_ids):
        for yn, sample_id in enumerate(sample_ids):
            dso.data[yn, xn] = sample_data[sample_id][gene_id]['VALUE']

    return dso
def load_datafile(self, file):
    reader = csv.reader(open(file, 'rU'), delimiter='\t', dialect='excel')

    hrow = next(reader)  # Get top row
    slabels = []
    data = []

    if hrow[0] == 'Profiled Data Type':
        # Is a Chenomx output file; use the other columns to map data scale/etc. once implemented
        next(reader)  # Skip date row
        hrow = next(reader)
        labels = hrow[2:]  # We strip off the pH here; might be nice to keep it
        entities = [self.m.db.synrev[l] if l in self.m.db.synrev else None for l in labels]  # Map to entities if they exist

        next(reader)  # Skip compound ID
        next(reader)  # Skip InChI
        next(reader)  # Skip SMILES

        for hrow in reader:  # Now read the data rows
            slabels.append(hrow[0])
            td = []
            for x in hrow[2:]:
                try:
                    td.append(float(x))
                except:
                    td.append(0)
            data.append(td)

    data = np.array(data)
    dso = DataSet(size=data.shape)
    print(data.shape)
    dso.labels[1] = labels
    dso.entities[1] = entities
    dso.labels[0] = slabels
    dso.data = data

    return {'output': dso}
def generate(self, input=None):
    dsi = input
    # BINNING USING CONFIG
    # Generate bin values for range start_scale to end_scale
    # Calculate the number of bins at binsize across range
    dso = DataSet()
    dso.import_data(dsi)

    r = dsi.scales_r[1]

    self._bin_size, self._bin_offset = self.config.get('bin_size'), self.config.get('bin_offset')

    bins = np.arange(r[0] + self._bin_offset, r[1] + self._bin_offset, self._bin_size)
    number_of_bins = len(bins) - 1

    # Can't increase the size of data; if bins > current size return the original
    if number_of_bins >= len(dso.scales[1]):
        return {'dso': dso}

    # Resize (lossy) to the new shape
    old_shape, new_shape = list(dsi.data.shape), list(dso.data.shape)
    new_shape[1] = number_of_bins
    dso.crop(new_shape)  # Lossy crop, but we'll be within the boundary below

    for n, d in enumerate(dsi.data):
        binned_data = np.histogram(dsi.scales[1], bins=bins, weights=d)
        binned_num = np.histogram(dsi.scales[1], bins=bins)  # Number of data points that ended up contributing to each bin
        dso.data[n, :] = binned_data[0] / binned_num[0]  # Mean

    dso.scales[1] = [float(x) for x in binned_data[1][:-1]]
    dso.labels[1] = [str(x) for x in binned_data[1][:-1]]

    # Remove any NaNs that have crept in (due to the histogram)
    dso.remove_invalid_data()

    return {'output': dso, 'input': input}  # Pass back input for difference plot
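# --- Hedged sketch of the binning trick used above (toy data, not from the source) ----
# np.histogram with weights= sums the intensities that fall into each ppm bin; dividing
# by the unweighted histogram (the per-bin point counts) turns that sum into a per-bin
# mean, which is what the loop above stores for each spectrum.
def _binning_sketch():
    import numpy as np

    ppm = np.linspace(0.0, 10.0, 101)        # scale for one spectrum
    intensities = np.sin(ppm) + 2.0          # toy spectrum
    bins = np.arange(0.0, 10.5, 0.5)         # 0.5 ppm bins

    summed, edges = np.histogram(ppm, bins=bins, weights=intensities)
    counts, _ = np.histogram(ppm, bins=bins)
    binned_mean = summed / counts            # mean intensity per bin
    return edges[:-1], binned_mean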
def generate(self, input=None):
    pathways = [k for k, v in db.dbm.get_pathways()]

    pathway_compounds = dict()
    for k, p in db.dbm.get_pathways():
        pathway_compounds[p.id] = set([m for m in p.compounds])

    data_m, labels_m = self.build_matrix(pathways, pathway_compounds)

    pathway_reactions = dict()
    for k, p in list(db.dbm.pathways.items()):
        pathway_reactions[p.id] = set([m for m in p.reactions])

    data_r, labels_r = self.build_matrix(pathways, pathway_reactions)

    pathway_active_reactions = dict()
    pathway_active_compounds = dict()
    active_pathways = input.entities[1]
    active_pathways_id = []

    for p in active_pathways:
        pathway_active_reactions[p.id] = set([r for r in p.reactions])
        pathway_active_compounds[p.id] = set([r for r in p.compounds])
        active_pathways_id.append(p.id)

    data_ar, labels_ar = self.build_matrix(active_pathways_id, pathway_active_reactions)
    data_am, labels_am = self.build_matrix(active_pathways_id, pathway_active_compounds)

    dim = len(data_ar)

    dso_r = DataSet(size=(dim, dim))
    dso_r.data = data_ar
    dso_r.labels[1] = labels_ar

    dso_m = DataSet(size=(dim, dim))
    dso_m.data = data_am
    dso_m.labels[1] = labels_am

    return {'dso_r': dso_r, 'dso_m': dso_m}
def generate(self, input=None):
    pathways = list(self.m.db.pathways.keys())

    pathway_compounds = dict()
    for k, p in list(self.m.db.pathways.items()):
        pathway_compounds[p.id] = set([m for m in p.compounds])

    data_m, labels_m = self.build_matrix(pathways, pathway_compounds)

    pathway_reactions = dict()
    for k, p in list(self.m.db.pathways.items()):
        pathway_reactions[p.id] = set([m for m in p.reactions])

    data_r, labels_r = self.build_matrix(pathways, pathway_reactions)

    pathway_active_reactions = dict()
    pathway_active_compounds = dict()
    active_pathways = input.entities[1]  # [self.parent.db.pathways[p] for p in self.parent.config.value('/Pathways/Show').split(',')]
    active_pathways_id = []

    for p in active_pathways:
        pathway_active_reactions[p.id] = set([r for r in p.reactions])
        pathway_active_compounds[p.id] = set([r for r in p.compounds])
        active_pathways_id.append(p.id)

    data_ar, labels_ar = self.build_matrix(active_pathways_id, pathway_active_reactions)
    data_am, labels_am = self.build_matrix(active_pathways_id, pathway_active_compounds)

    dim = len(data_ar)

    dso_r = DataSet(size=(dim, dim))
    dso_r.data = data_ar
    dso_r.labels[1] = labels_ar

    dso_m = DataSet(size=(dim, dim))
    dso_m.data = data_am
    dso_m.labels[1] = labels_am

    return {'dso_r': dso_r, 'dso_m': dso_m}
def load_bml_datafile(self, data_path, target, name):
    dso = DataSet()

    # Read in data for the graphing metabolite, with associated value (generate mean)
    reader = csv.reader(utils.nonull(open(data_path, 'rb')), delimiter='\t', dialect='excel')

    # Scan for the top row (the 'metabolite' header); give up if it's not there
    for row in reader:
        if row and row[0] == 'metabolite':
            break
    else:
        return

    samples = row[1:-2]  # Sample identities
    samples = [sample[8:-1] for sample in samples]

    xdim = 0
    ydim = len(samples)

    raw_data = []
    metabolites = []

    for row in reader:
        xdim += 1
        metabolites.append(row[0])
        raw_data.append([float(i) for i in row[1:-2]])

    dso = DataSet(size=(ydim, xdim))
    dso.labels[1] = metabolites
    dso.data = np.array(raw_data).T

    dso.name = name
    dso.description = 'Imported from FIMA (%s)' % name

    return dso
def process_data_to_dso(self, nmr_data, nmr_ppms, sample_labels, experiment_name):
    print("Processing spectra to dso...")
    sample_n = len(sample_labels)
    ppm_n = len(nmr_ppms)

    dso = DataSet(size=(sample_n, ppm_n))

    for n, nd in enumerate(nmr_data):
        print("Spectra %s" % sample_labels[n])
        dso.data[n, :] = nd
        dso.labels[0][n] = sample_labels[n]

    dso.labels[1] = [str(ppm) for ppm in nmr_ppms]
    dso.scales[1] = [float(ppm) for ppm in nmr_ppms]
    dso.name = experiment_name

    return dso
def load_metabolights(self, filename, id_col=0, name_col=4, data_col=18):
    # Load from csv with experiments in COLUMNS, metabolites in ROWS
    print("Loading Metabolights...")
    # sample            1               2               3               4
    # class             ADG10003u_007   ADG10003u_008   ADG10003u_009   ADG10003u_010   ADG19007u_192
    # 2-oxoisovalerate  0.3841          0.44603         0.45971         0.40812
    reader = csv.reader(open(filename, 'rU'), delimiter=',', dialect='excel')

    # Sample identities from the top row (sample labels)
    hrow = next(reader)
    sample_ids = hrow[1:]

    # Sample classes from the second row; crop off everything after u_
    hrow = next(reader)
    classes = hrow[1:]
    classes = [c.split('u_')[0] for c in classes]

    metabolites = []
    metabolite_data = []
    # Read in metabolite data; n.b. can have >1 entry per metabolite so need to allow for this
    for row in reader:
        if row[0] != '':  # Skip empty rows
            metabolites.append(row[0])
            metabolite_data.append(row[1:])

    ydim = len(classes)
    xdim = len(metabolites)

    dso = DataSet(size=(ydim, xdim))
    dso.labels[0] = sample_ids
    dso.classes[0] = classes
    dso.labels[1] = metabolites

    for n, md in enumerate(metabolite_data):
        dso.data[:, n] = np.array(md, dtype=float)

    return dso
def load_datafile(self, filename):
    # Determine if we've got a csv or peakml file (extension)
    #self.data.o['output'].empty()
    dso = DataSet()

    # Read data in from peakml format file
    xml = et.parse(filename)

    # Get sample ids, names and class groupings
    sets = xml.iterfind('header/sets/set')
    midclass = {}
    classes = set()
    measurements = []
    masses = {}

    for aset in sets:
        id = aset.find('id').text
        mids = aset.find('measurementids').text
        for mid in self.decode(mids):
            midclass[mid] = id
            measurements.append(mid)
        classes.add(id)

    # We have all the sample data now, parse the intensity and identity info
    peaksets = xml.iterfind('peaks/peak')
    quantities = defaultdict(dict)
    all_identities = []

    for peakset in peaksets:
        # Find metabolite identities
        annotations = peakset.iterfind('annotations/annotation')
        identities = False
        for annotation in annotations:
            if annotation.find('label').text == 'identification':
                identities = annotation.find('value').text.split(', ')
                all_identities.extend(identities)
                break

        if identities:
            # PeakML supports multiple alternative metabolite identities; currently we don't, so duplicate
            # We have identities, now get intensities for the different samples
            chromatograms = peakset.iterfind('peaks/peak')  # Next level down
            for chromatogram in chromatograms:
                mid = chromatogram.find('measurementid').text
                intensity = float(chromatogram.find('intensity').text)
                mass = float(chromatogram.find('mass').text)

                # Write out to each of the identities table (need to buffer til we have the entire list)
                for identity in identities:
                    quantities[mid][identity] = intensity

            # Write out to each of the identities table (need to buffer til we have the entire list)
            for identity in identities:
                masses[identity] = mass

    # Sort the identities/masses into consecutive order
    # Quantities table built; class table built; now rearrange into dso
    dso.empty((len(measurements), len(all_identities)))
    dso.labels[0] = measurements
    dso.classes[0] = [midclass[mid] for mid in measurements]

    dso.labels[1] = all_identities
    db_hmdbids = self.m.db.unification['HMDB']
    dso.entities[1] = [db_hmdbids[hmdbid] if hmdbid in db_hmdbids else None for hmdbid in all_identities]
    dso.scales[1] = [float(masses[i]) for i in all_identities]

    for mid, identities in list(quantities.items()):
        for identity, intensity in list(identities.items()):
            r = measurements.index(mid)
            c = all_identities.index(identity)
            dso.data[r, c] = intensity

    dso.name = os.path.basename(filename)
    dso.description = 'Imported PeakML file'

    self.set_name(dso.name)

    return {'output': dso}
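# --- Hedged sketch of the ElementTree access pattern used by the PeakML loader --------
# iterfind() walks all elements matching a path, find().text pulls out single values.
# The XML below is a made-up miniature, not the real PeakML schema, and splitting the
# measurement ids on ',' stands in for the loader's own self.decode() helper.
def _iterfind_sketch():
    from xml.etree import ElementTree as et

    xml = et.fromstring(
        '<peakml><header><sets>'
        '<set><id>control</id><measurementids>0,1</measurementids></set>'
        '<set><id>test</id><measurementids>2,3</measurementids></set>'
        '</sets></header></peakml>')

    results = []
    for aset in xml.iterfind('header/sets/set'):
        set_id = aset.find('id').text
        mids = aset.find('measurementids').text.split(',')
        results.append((set_id, mids))
    return results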
def load_datafile(self, filename):
    # Determine if we've got a csv or peakml file (extension)
    #self.data.o['output'].empty()
    dso = DataSet()

    # Read data in from peakml format file
    xml = et.parse(filename)

    # Get sample ids, names and class groupings
    sets = xml.iterfind('header/sets/set')
    midclass = {}
    classes = set()
    measurements = []
    masses = {}

    for aset in sets:
        id = aset.find('id').text
        mids = aset.find('measurementids').text
        for mid in self.decode(mids):
            midclass[mid] = id
            measurements.append(mid)
        classes.add(id)

    # We have all the sample data now, parse the intensity and identity info
    peaksets = xml.iterfind('peaks/peak')
    quantities = defaultdict(dict)
    all_identities = []

    for peakset in peaksets:
        # Find metabolite identities
        annotations = peakset.iterfind('annotations/annotation')
        identities = False
        for annotation in annotations:
            if annotation.find('label').text == 'identification':
                identities = annotation.find('value').text.split(', ')
                all_identities.extend(identities)
                break

        if identities:
            # PeakML supports multiple alternative metabolite identities; currently we don't, so duplicate
            # We have identities, now get intensities for the different samples
            chromatograms = peakset.iterfind('peaks/peak')  # Next level down
            for chromatogram in chromatograms:
                mid = chromatogram.find('measurementid').text
                intensity = float(chromatogram.find('intensity').text)
                mass = float(chromatogram.find('mass').text)

                # Write out to each of the identities table (need to buffer til we have the entire list)
                for identity in identities:
                    quantities[mid][identity] = intensity

            # Write out to each of the identities table (need to buffer til we have the entire list)
            for identity in identities:
                masses[identity] = mass

    # Sort the identities/masses into consecutive order
    # Quantities table built; class table built; now rearrange into dso
    dso.empty((len(measurements), len(all_identities)))
    dso.labels[0] = measurements
    dso.classes[0] = [midclass[mid] for mid in measurements]

    dso.labels[1] = all_identities
    db_hmdbids = self.m.db.unification['HMDB']
    dso.entities[1] = [db_hmdbids[hmdbid] if hmdbid in db_hmdbids else None for hmdbid in all_identities]
    dso.scales[1] = [float(masses[i]) for i in all_identities]

    for mid, identities in list(quantities.items()):
        for identity, intensity in list(identities.items()):
            r = measurements.index(mid)
            c = all_identities.index(identity)
            dso.data[r, c] = intensity

    dso.name = os.path.basename(filename)
    dso.description = 'Imported PeakML file'

    self.change_name.emit(dso.name)

    return {'output': dso}
def load_csv_R(self, filename):
    # Load from csv with experiments in ROWS, metabolites in COLUMNS
    # Read in data for the graphing metabolite, with associated value (generate mean)
    f = open(filename, 'rU')
    fsize = os.path.getsize(filename)
    reader = csv.reader(f, delimiter=str(','), dialect='excel')
    print('R')

    hrow = next(reader)  # Get top row
    metabolites = hrow[2:]
    ydim = 0
    xdim = len(metabolites)

    samples = []
    classes = []
    raw_data = []

    # Build quants table for metabolite classes
    #for metabolite in self.metabolites:
    #    quantities[ metabolite ] = defaultdict(list)

    for n, row in enumerate(reader):
        ydim += 1
        if row[1] != '.':  # Skip excluded classes; row[1] = Class
            samples.append(row[0])
            classes.append(row[1])
            data_row = []
            for c in row[2:]:
                try:
                    c = float(c)
                except:
                    c = 0
                data_row.append(c)
            raw_data.append(data_row)
            #metabolite_column = hrow.index( metabolite )
            #if row[ metabolite_column ]:
            #    quantities[metabolite][ row[1] ].append( float(row[ metabolite_column ]) )
            #    self.statistics['ymin'] = min( self.statistics['ymin'], float(row[ metabolite_column ]) )
            #    self.statistics['ymax'] = max( self.statistics['ymax'], float(row[ metabolite_column ]) )
            #else:
            #    quantities[metabolite][ row[1] ].append( 0 )
        else:
            pass  #self.statistics['excluded'] += 1

        if n % 100 == 0:
            self.progress.emit(float(f.tell()) / fsize)

    # Build dataset object
    dso = DataSet(size=(xdim, ydim))
    dso.empty(size=(ydim, xdim))
    #dso.labels[1] = metabolites

    scales = []
    mlabels = []
    for m in metabolites:
        try:
            scales.append(float(m))
            mlabels.append(None)
        except:
            scales.append(None)
            mlabels.append(m)

    dso.scales[0] = [None] * len(samples)
    dso.labels[0] = samples
    dso.classes[0] = classes
    dso.entities[0] = [None] * len(samples)

    dso.scales[1] = scales
    dso.labels[1] = mlabels
    dso.entities[1] = [None] * len(scales)
    dso.classes[1] = [None] * len(scales)

    dso.data = np.array(raw_data)

    return dso
def load_soft_dataset(self, filename):
    # Load from SOFT data file for genes
    # SOFT files are a /sort of/ bastardized csv with data in tab-separated columns
    # So, we use the csv reader to get that, accounting for most stuff being single field with
    # slightly strange identifiers
    f = open(filename, 'rU')
    fsize = os.path.getsize(filename)

    reader = csv.reader(f, delimiter='\t', dialect='excel')

    soft_data = self.preprocess_soft(reader, f=f, fsize=fsize)
    # soft_data now contains lists of sections with ^ markers

    database = {}
    dataset = {}
    dataset_data = {}
    subsets = {}

    for section, rows in list(soft_data.items()):
        if section.startswith('^DATABASE'):
            database = self.get_soft_metadata(rows)
        elif section.startswith('^DATASET'):
            dataset.update(self.get_soft_metadata(rows))  # update because there can be >1 entry per dataset
            data = self.get_soft_data(rows, '!dataset_table_begin', '!dataset_table_end')
            dataset_data = data
        elif section.startswith('^SUBSET'):
            key, subset_id = section.split(' = ')
            subsets[subset_id] = self.get_soft_metadata(rows)
            subsets[subset_id]['subset_sample_id'] = subsets[subset_id]['subset_sample_id'].split(',')  # Turn into a list of ids

    # We now have the entire dataset loaded, but in a bit of a messed up format
    # Build a dataset object to fit and map the data in
    sample_ids = []
    for k, subset in list(subsets.items()):
        sample_ids.extend(subset['subset_sample_id'])
    sample_ids = sorted(sample_ids)  # Get the samples sorted so we keep everything lined up

    class_lookup = {}
    for class_id, s in list(subsets.items()):
        for s_id in s['subset_sample_id']:
            class_lookup[s_id] = "%s (%s)" % (s['subset_description'] if 'subset_description' in s else '', class_id)

    xdim = len(dataset_data)  # Use the dataset table to access the gene list
    ydim = len(sample_ids)

    # Build dataset object
    dso = DataSet(size=(xdim, ydim))
    dso.empty(size=(ydim, xdim))

    gene_ids = sorted(dataset_data.keys())  # Get the keys sorted so we keep everything lined up

    dso.labels[0] = sample_ids
    dso.classes[0] = [class_lookup[s_id] for s_id in sample_ids]

    dso.labels[1] = [dataset_data[gene_id]['IDENTIFIER'] for gene_id in gene_ids]
    dso.entities[1] = [self.m.db.get_via_synonym(gene_id) for gene_id in dso.labels[1]]

    for xn, gene_id in enumerate(gene_ids):
        for yn, sample_id in enumerate(sample_ids):
            dso.data[yn, xn] = dataset_data[gene_id][sample_id]

    return dso
def load_csv_C(self, filename):
    # Load from csv with experiments in COLUMNS, metabolites in ROWS
    # Read in data for the graphing metabolite, with associated value (generate mean)
    f = open(filename, 'rU')
    fsize = os.path.getsize(filename)
    reader = csv.reader(f, delimiter=str(','), dialect='excel')

    hrow = next(reader)  # Discard top row (sample no's)
    samples = hrow[1:]

    hrow = next(reader)  # Get 2nd row
    classesa = hrow[1:]
    classes = [c for c in classesa if c != '.']

    metabolites = []
    data = []

    for n, row in enumerate(reader):
        metabolites.append(row[0])
        for cn, c in enumerate(row[1:]):
            if classesa[cn] != '.':  # Skip excluded classes
                try:
                    data.append(float(c))
                except:
                    data.append(0)

        if n % 100 == 0:
            self.progress.emit(float(f.tell()) / fsize)

    data = np.asarray(data)
    data = np.reshape(data, (n + 1, len(classes))).T

    xdim = len(metabolites)
    ydim = len(classes)

    # Build dataset object
    dso = DataSet(size=(xdim, ydim))
    dso.empty(size=(ydim, xdim))
    dso.labels[1] = metabolites

    scales = []
    mlabels = []
    for m in metabolites:
        try:
            scales.append(float(m))
            mlabels.append(None)
        except:
            scales.append(None)
            mlabels.append(m)

    dso.scales[0] = [None] * len(samples)
    dso.labels[0] = samples
    dso.classes[0] = classes
    dso.entities[0] = [None] * len(samples)

    dso.scales[1] = scales
    dso.labels[1] = mlabels
    dso.classes[1] = [None] * len(scales)
    dso.entities[1] = [None] * len(scales)

    dso.data = data

    return dso
def generate(self, input=None):
    data = input.data

    pca = PCA(n_components=self.config.get('number_of_components'))
    pca.fit(data)
    scores = pca.transform(data)

    # Build scores into a dso: no_of_samples x no_of_principal_components
    scored = DataSet(size=(scores.shape))
    scored.labels[0] = input.labels[0]
    scored.classes[0] = input.classes[0]
    scored.data = scores

    for n in range(0, scored.shape[1]):
        scored.labels[1][n] = 'Principal Component %d (%0.2f%%)' % (n + 1, pca.explained_variance_ratio_[n] * 100.)

    weightsd = DataSet(size=pca.components_.shape)
    weightsd.data = pca.components_
    weightsd.scales[1] = input.scales[1]

    dso_pc = {}
    for n in range(0, pca.components_.shape[0]):
        pcd = DataSet(size=(1, input.shape[1]))
        pcd.entities[1] = input.entities[1]
        pcd.labels[1] = input.labels[1]
        pcd.scales[1] = input.scales[1]
        pcd.data = weightsd.data[n:n + 1, :]
        dso_pc['pc%s' % (n + 1)] = pcd

        weightsd.labels[0][n] = "PC %s" % (n + 1)
        #weightsd.classes[0][n] = "PC %s" % (n+1)

    return dict(list({
        'dso': input,
        'pca': pca,
        'scores': scored,
        'weights': weightsd,
    }.items()) + list(dso_pc.items()))
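# --- Hedged sketch of the scikit-learn PCA calls used above (toy data) ----------------
# fit() learns the components, transform() projects the samples into score space;
# components_ holds the per-variable loadings and explained_variance_ratio_ the
# per-component variance fractions used in the axis labels. Shapes here are made up.
def _pca_sketch():
    import numpy as np
    from sklearn.decomposition import PCA

    rng = np.random.default_rng(0)
    data = rng.normal(size=(20, 5))      # 20 samples x 5 variables

    pca = PCA(n_components=2)
    scores = pca.fit_transform(data)     # (20, 2) sample scores
    loadings = pca.components_           # (2, 5) per-variable weights
    labels = ['PC %d (%0.2f%%)' % (n + 1, v * 100.)
              for n, v in enumerate(pca.explained_variance_ratio_)]
    return scores, loadings, labels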
def generate(self, input_1=None, input_2=None, input_3=None, input_4=None):
    # Iterate all the compounds in the current analysis
    # Assign a score to each of the compound's pathways
    # Sum up, crop and return a list of pathway_ids to display
    # Pass this in as the list to view
    # + requested pathways, - excluded pathways
    db = self.m.db

    mining_depth = self.config.get('/Data/MiningDepth')
    mining_type = self.config.get('/Data/MiningType')

    pathway_scores = defaultdict(int)

    for dsi in input_1, input_2, input_3, input_4:
        if dsi is None:
            continue

        print("Mining using '%s'" % mining_type)

        for n, entity in enumerate(dsi.entities[1]):
            if entity is None:
                continue  # Skip

            score = dsi.data[0, n]
            #score = self.analysis[ m_id ]['score']

            # 1' neighbours; 2' neighbours etc. add score
            # Get a list of methods in connected reactions, add their score % to this compound
            # if m_id in db.compounds.keys():
            #     n_compounds = [r.compounds for r in db.compounds[ m_id ].reactions ]
            #     n_compounds = [m for ml in n_compounds for m in ml if n_m.id in self.analysis and m.id != m_id ]
            #     for n_m in n_compounds:
            #         score += self.analysis[ n_m.id ]['score'] * 0.5

            # Get the entity's pathways
            pathways = entity.pathways
            if pathways == []:
                continue

            if self.config.get('/Data/MiningShared'):
                # Share the change score between the associated pathways;
                # this prevents compounds having undue influence
                score = score / len(pathways)

            for p in pathways:
                mining_val = {
                    'c': abs(score),
                    'u': max(0, score),
                    'd': abs(min(0, score)),
                    'm': 1.0,
                    't': score,
                }
                pathway_scores[p] += mining_val[mining_type]

    # If we're using tendency scaling, abs the accumulated scores here
    if mining_type == 't':
        for p, v in list(pathway_scores.items()):
            pathway_scores[p] = abs(v)

    # If we're scaling to pathway size, divide each score by the number of reactions
    if self.config.get('/Data/MiningRelative'):
        print("Scaling pathway scores to pathway sizes...")
        for p, v in list(pathway_scores.items()):
            pathway_scores[p] = float(v) / len(p.reactions)

    if not pathway_scores:  # No data
        raise BaseException

    # Now take the accumulated scores and create the output
    pathway_scorest = list(pathway_scores.items())  # Switch to a list of tuples so we can sort
    pathway_scorest = [(p, v) for p, v in pathway_scorest if v > 0]  # Remove any scores of 0
    pathway_scorest.sort(key=lambda tup: tup[1], reverse=True)  # Sort by scores (either system)

    # Get top N defined by the mining_depth parameter
    keep_pathways = pathway_scorest[0:mining_depth]
    remaining_pathways = pathway_scorest[mining_depth + 1:mining_depth + 100]

    print("Mining recommended %d out of %d" % (len(keep_pathways), len(pathway_scores)))
    for n, p in enumerate(keep_pathways):
        print("- %d. %s [%.2f]" % (n + 1, p[0].name, p[1]))

    #self.analysis['mining_ranked_remaining_pathways'] = []
    #if remaining_pathways:
    #    print "Note: Next pathways by current scoring method are..."
    #    for n2, p in enumerate(remaining_pathways):
    #        print "- %d. %s [%.2f]" % (n+n2+1, db.pathways[ p[0] ].name, p[1])
    #        self.analysis['mining_ranked_remaining_pathways'].append( p[0] )
    #self.analysis_suggested_pathways = [db.pathways[p[0]] for p in pathway_scorest]

    dso = DataSet(size=(1, len(keep_pathways)))
    dso.entities[1] = [k for k, v in keep_pathways]
    dso.labels[1] = [k.name for k, v in keep_pathways]
    dso.data = np.array([v for k, v in keep_pathways], ndmin=2)
    dso.labels[0][0] = "Pathway mining scores"

    return {'output': dso}
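# --- Hedged sketch of the score accumulation above (made-up entities and values) ------
# Each entity's score is optionally split across its pathways, then summed per pathway
# with a defaultdict; sorting the items by value gives the ranked "mined" pathways.
def _mining_sketch():
    from collections import defaultdict

    entity_pathways = {'glucose': ['glycolysis', 'ppp'], 'pyruvate': ['glycolysis']}
    entity_scores = {'glucose': 0.8, 'pyruvate': -0.4}

    pathway_scores = defaultdict(float)
    for entity, pathways in entity_pathways.items():
        share = entity_scores[entity] / len(pathways)  # share the score between pathways
        for p in pathways:
            pathway_scores[p] += abs(share)            # 'c' (absolute change) style scoring

    ranked = sorted(pathway_scores.items(), key=lambda kv: kv[1], reverse=True)
    return ranked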
def generate(self, input=None):
    dso = input

    _experiment_test = self.config.get('experiment_test')
    _experiment_control = self.config.get('experiment_control')

    data = dso.data

    plsr = PLSRegression(n_components=self.config.get('number_of_components'),
                         scale=self.config.get('autoscale'))  #, algorithm=self.config.get('algorithm'))

    Y = np.array([0 if c == _experiment_control else 1 for c in dso.classes[0]])

    plsr.fit(data, Y)

    # Build scores into a dso: no_of_samples x no_of_latent_variables
    scored = DataSet(size=(len(plsr.x_scores_), len(plsr.x_scores_[0])))
    scored.labels[0] = input.labels[0]
    scored.classes[0] = input.classes[0]

    for n, s in enumerate(plsr.x_scores_.T):
        scored.data[:, n] = s
        scored.labels[1][n] = 'Latent Variable %d' % (n + 1)  #, plsr.y_weights_[0][n])

    # PLS-DA regions; mean +- 95% confidence in each axis for each cluster
    figure_regions = []
    cw_x = defaultdict(list)
    cw_y = defaultdict(list)

    for c in list(cw_x.keys()):
        # Calculate mean point
        cx = np.mean(cw_x[c])
        cy = np.mean(cw_y[c])

        # Calculate 95% CI (2sd ~= 95%)
        rx = np.std(cw_x[c]) * 2
        ry = np.std(cw_y[c]) * 2

        figure_regions.append((c, cx, cy, rx, ry))

    # Label up the top 50 (the values are retained; just for clarity)
    wmx = np.amax(np.absolute(plsr.x_weights_), axis=1)
    dso_z = list(zip(dso.scales[1], dso.entities[1], dso.labels[1]))
    dso_z = sorted(zip(dso_z, wmx), key=lambda x: x[1])[-50:]  # Top 50
    dso_z = [x for x, wmx in dso_z]

    weightsd = DataSet(size=plsr.x_weights_.T.shape)
    weightsd.data = plsr.x_weights_.T
    weightsd.scales[1] = input.scales[1]

    dso_lv = {}
    for n in range(0, plsr.x_weights_.shape[1]):
        lvd = DataSet(size=(1, input.shape[1]))
        lvd.entities[1] = input.entities[1]
        lvd.labels[1] = input.labels[1]
        lvd.scales[1] = input.scales[1]
        lvd.data = plsr.x_weights_[:, n:n + 1].T
        dso_lv['lv%s' % (n + 1)] = lvd

        weightsd.labels[0][n] = "Weights on LV %s" % (n + 1)
        weightsd.classes[0][n] = "LV %s" % (n + 1)

    return dict(list({
        'dso': dso,
        'scores': scored,
        'weights': weightsd,
        #'figure_data': figure_data,
        #'figure_regions': figure_regions,
        'y_weights': plsr.y_weights_,
        'x_weights': plsr.x_weights_,
    }.items()) + list(dso_lv.items()))
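# --- Hedged sketch of the PLS-DA fit above (toy data, binary class vector) ------------
# Class membership is encoded as a 0/1 response; x_scores_ holds the per-sample
# latent-variable scores and x_weights_ the per-variable weights used above. The data
# shapes and class names here are invented.
def _plsda_sketch():
    import numpy as np
    from sklearn.cross_decomposition import PLSRegression

    rng = np.random.default_rng(0)
    data = rng.normal(size=(20, 6))                    # 20 samples x 6 variables
    classes = ['control'] * 10 + ['test'] * 10
    Y = np.array([0 if c == 'control' else 1 for c in classes])

    plsr = PLSRegression(n_components=2, scale=True)
    plsr.fit(data, Y)
    scores = plsr.x_scores_                            # (20, 2) sample scores
    weights = plsr.x_weights_                          # (6, 2) variable weights
    return scores, weights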
def generate(self, input=None):
    dso = input

    _experiment_test = self.config.get('experiment_test')
    _experiment_control = self.config.get('experiment_control')

    data = dso.data

    plsr = PLSRegression(n_components=self.config.get('number_of_components'),
                         scale=self.config.get('autoscale'))  #, algorithm=self.config.get('algorithm'))

    Y = np.array([0 if c == _experiment_control else 1 for c in dso.classes[0]])
    #Y = Y.reshape( (len(dso.classes[0]),1) )

    plsr.fit(data, Y)

    #figure_data = zip( dso.classes[0], plsr.x_scores_[:,0], plsr.x_scores_[:,1])

    # Build scores into a dso: no_of_samples x no_of_latent_variables
    scored = DataSet(size=(len(plsr.x_scores_), len(plsr.x_scores_[0])))
    scored.labels[0] = input.labels[0]
    scored.classes[0] = input.classes[0]

    print(plsr.x_scores_.shape)
    print(scored.data.shape)

    for n, s in enumerate(plsr.x_scores_.T):
        scored.data[:, n] = s
        scored.labels[1][n] = 'Latent Variable %d (%0.2f%%)' % (n + 1, plsr.y_weights_[0][0] * 100)

    # PLS-DA regions; mean +- 95% confidence in each axis for each cluster
    figure_regions = []
    cw_x = defaultdict(list)
    cw_y = defaultdict(list)
    #for c, x, y in figure_data:
    #    cw_x[c].append( x )
    #    cw_y[c].append( y )

    for c in list(cw_x.keys()):
        # Calculate mean point
        cx = np.mean(cw_x[c])
        cy = np.mean(cw_y[c])

        # Calculate 95% CI (2sd ~= 95%)
        rx = np.std(cw_x[c]) * 2
        ry = np.std(cw_y[c]) * 2
        # Alternative: 1.95 * SEM => 95% confidence
        #srn = np.sqrt( len( cw_x[c] ) )  # Sample numbers sqrt
        #rx = 1.95*(np.std( cw_x[c] )/srn )
        #ry = 1.95*(np.std( cw_y[c] )/srn )

        figure_regions.append((c, cx, cy, rx, ry))

    # Label up the top 50 (the values are retained; just for clarity)
    wmx = np.amax(np.absolute(plsr.x_weights_), axis=1)
    dso_z = list(zip(dso.scales[1], dso.entities[1], dso.labels[1]))
    dso_z = sorted(zip(dso_z, wmx), key=lambda x: x[1])[-50:]  # Top 50
    dso_z = [x for x, wmx in dso_z]

    dso_lv = {}
    for n in range(0, plsr.x_weights_.shape[1]):
        lvd = DataSet(size=(1, input.shape[1]))
        lvd.entities[1] = input.entities[1]
        lvd.labels[1] = input.labels[1]
        lvd.scales[1] = input.scales[1]
        lvd.data = plsr.x_weights_[:, n:n + 1].T
        dso_lv['lv%s' % (n + 1)] = lvd

    return dict(list({
        'dso': dso,
        'scores': scored,
        #'figure_data': figure_data,
        #'figure_regions': figure_regions,
        'y_weights': plsr.y_weights_,
        'x_weights': plsr.x_weights_,
    }.items()) + list(dso_lv.items()))