def generate(self, input=None): data = input.data pca = PCA(n_components=self.config.get("number_of_components")) pca.fit(data.T) # Transpose it, as vars need to along the top weights = pca.transform(data.T) # Get weights? # Label up the top 50 (the values are retained; just for clarity) wmx = np.amax(np.absolute(weights), axis=1) dso_z = list(zip(input.scales[1], input.entities[1], input.labels[1])) dso_z = sorted(zip(dso_z, wmx), key=lambda x: x[1])[-50:] # Top 50 dso_z = [x for x, wmx in dso_z] # Build scores into a dso no_of_samples x no_of_principal_components scored = DataSet(size=(len(pca.components_[0]), len(pca.components_))) scored.labels[0] = input.labels[0] scored.classes[0] = input.classes[0] for n, s in enumerate(pca.components_): scored.data[:, n] = s scored.labels[1][n] = "Principal Component %d (%0.2f%%)" % (n + 1, pca.explained_variance_ratio_[0] * 100.0) weightsd = DataSet(size=weights.T.shape) weightsd.data = weights.T weightsd.scales[1] = input.scales[1] dso_pc = {} for n in range(0, weights.shape[1]): pcd = DataSet(size=(1, input.shape[1])) pcd.entities[1] = input.entities[1] pcd.labels[1] = input.labels[1] pcd.scales[1] = input.scales[1] pcd.data = weights[:, n : n + 1].T dso_pc["pc%s" % (n + 1)] = pcd weightsd.labels[0][n] = "PC %s" % (n + 1) weightsd.classes[0][n] = "PC %s" % (n + 1) return dict( list({"dso": input, "pca": pca, "scores": scored, "weights": weightsd, "wmx": wmx, "dso_z": dso_z}.items()) + list(dso_pc.items()) )
def generate(self, input=None):
    pathways = [k for k, v in db.dbm.get_pathways()]

    pathway_compounds = dict()
    for k, p in db.dbm.get_pathways():
        pathway_compounds[p.id] = set([m for m in p.compounds])
    data_m, labels_m = self.build_matrix(pathways, pathway_compounds)

    pathway_reactions = dict()
    for k, p in list(db.dbm.pathways.items()):
        pathway_reactions[p.id] = set([m for m in p.reactions])
    data_r, labels_r = self.build_matrix(pathways, pathway_reactions)

    pathway_active_reactions = dict()
    pathway_active_compounds = dict()
    active_pathways = input.entities[1]
    active_pathways_id = []

    for p in active_pathways:
        pathway_active_reactions[p.id] = set([r for r in p.reactions])
        pathway_active_compounds[p.id] = set([r for r in p.compounds])
        active_pathways_id.append(p.id)

    data_ar, labels_ar = self.build_matrix(active_pathways_id, pathway_active_reactions)
    data_am, labels_am = self.build_matrix(active_pathways_id, pathway_active_compounds)

    dim = len(data_ar)

    dso_r = DataSet(size=(dim, dim))
    dso_r.data = data_ar
    dso_r.labels[1] = labels_ar

    dso_m = DataSet(size=(dim, dim))
    dso_m.data = data_am
    dso_m.labels[1] = labels_am

    return {'dso_r': dso_r, 'dso_m': dso_m}
def normalise(self, dsi):
    # Generate bin values for range start_scale to end_scale
    # Calculate the number of bins at binsize across range
    dso = DataSet(size=dsi.shape)
    dso.import_data(dsi)
    dso.data = self.algorithms[self.config.get('algorithm')](dso.data)
    # -- optionally use the line widths and take max within each of these for each spectrum (peak shiftiness)
    # Filter the original data with those locations and output
    return dso
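# Illustrative sketch (not part of the original module): normalise() above looks its algorithm up
# in a dict of callables keyed by the 'algorithm' config value. A minimal stand-in registry might
# look like this; the algorithm names and the _demo_* name are hypothetical.
def _demo_normalise(data, algorithm='TSA'):
    import numpy as np
    algorithms = {
        # Total spectral area: scale each spectrum (row) so its intensities sum to 1
        'TSA': lambda d: d / d.sum(axis=1, keepdims=True),
        # Mean-centre each variable (column)
        'Mean center': lambda d: d - d.mean(axis=0),
    }
    return algorithms[algorithm](np.asarray(data, dtype=float))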
def generate(self, input=None):
    pathways = list(self.m.db.pathways.keys())

    pathway_compounds = dict()
    for k, p in list(self.m.db.pathways.items()):
        pathway_compounds[p.id] = set([m for m in p.compounds])
    data_m, labels_m = self.build_matrix(pathways, pathway_compounds)

    pathway_reactions = dict()
    for k, p in list(self.m.db.pathways.items()):
        pathway_reactions[p.id] = set([m for m in p.reactions])
    data_r, labels_r = self.build_matrix(pathways, pathway_reactions)

    pathway_active_reactions = dict()
    pathway_active_compounds = dict()
    active_pathways = input.entities[1]  # [self.parent.db.pathways[p] for p in self.parent.config.value('/Pathways/Show').split(',')]
    active_pathways_id = []

    for p in active_pathways:
        pathway_active_reactions[p.id] = set([r for r in p.reactions])
        pathway_active_compounds[p.id] = set([r for r in p.compounds])
        active_pathways_id.append(p.id)

    data_ar, labels_ar = self.build_matrix(active_pathways_id, pathway_active_reactions)
    data_am, labels_am = self.build_matrix(active_pathways_id, pathway_active_compounds)

    dim = len(data_ar)

    dso_r = DataSet(size=(dim, dim))
    dso_r.data = data_ar
    dso_r.labels[1] = labels_ar

    dso_m = DataSet(size=(dim, dim))
    dso_m.data = data_am
    dso_m.labels[1] = labels_am

    return {'dso_r': dso_r, 'dso_m': dso_m}
def generate(self, input=None):
    data = input.data

    pca = PCA(n_components=self.config.get('number_of_components'))
    pca.fit(data)
    scores = pca.transform(data)

    # Build scores into a dso: no_of_samples x no_of_principal_components
    scored = DataSet(size=(scores.shape))
    scored.labels[0] = input.labels[0]
    scored.classes[0] = input.classes[0]
    scored.data = scores

    for n in range(0, scored.shape[1]):
        scored.labels[1][n] = 'Principal Component %d (%0.2f%%)' % (n + 1, pca.explained_variance_ratio_[n] * 100.)

    weightsd = DataSet(size=pca.components_.shape)
    weightsd.data = pca.components_
    weightsd.scales[1] = input.scales[1]

    dso_pc = {}
    for n in range(0, pca.components_.shape[0]):
        pcd = DataSet(size=(1, input.shape[1]))
        pcd.entities[1] = input.entities[1]
        pcd.labels[1] = input.labels[1]
        pcd.scales[1] = input.scales[1]
        pcd.data = weightsd.data[n:n + 1, :]
        dso_pc['pc%s' % (n + 1)] = pcd

        weightsd.labels[0][n] = "PC %s" % (n + 1)
        #weightsd.classes[0][n] = "PC %s" % (n+1)

    return dict(list({
        'dso': input,
        'pca': pca,
        'scores': scored,
        'weights': weightsd,
    }.items()) + list(dso_pc.items()))
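# Illustrative sketch (not part of the original module): the shapes the PCA generate() methods
# above rely on when wrapping scikit-learn's PCA output into DataSet objects. The _demo_* name
# and the random toy matrix are assumptions for demonstration only.
def _demo_pca_shapes():
    import numpy as np
    from sklearn.decomposition import PCA

    X = np.random.rand(10, 40)                        # 10 samples x 40 variables
    pca = PCA(n_components=2).fit(X)

    scores = pca.transform(X)                         # (10, 2): per-sample scores
    loadings = pca.components_                        # (2, 40): per-variable loadings for each component
    var_pct = pca.explained_variance_ratio_ * 100.0   # per-component % variance explained, used in the labels
    return scores.shape, loadings.shape, var_pct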
def generate(self, input=None):
    data = input.data

    pca = PCA(n_components=self.config.get('number_of_components'))
    pca.fit(data.T)  # Transpose it, as vars need to be along the top
    weights = pca.transform(data.T)  # Get weights?

    # Label up the top 50 (the values are retained; just for clarity)
    wmx = np.amax(np.absolute(weights), axis=1)
    dso_z = list(zip(input.scales[1], input.entities[1], input.labels[1]))
    dso_z = sorted(zip(dso_z, wmx), key=lambda x: x[1])[-50:]  # Top 50
    dso_z = [x for x, wmx in dso_z]

    # Build scores into a dso: no_of_samples x no_of_principal_components
    scored = DataSet(size=(len(pca.components_[0]), len(pca.components_)))
    scored.labels[0] = input.labels[0]
    scored.classes[0] = input.classes[0]

    for n, s in enumerate(pca.components_):
        scored.data[:, n] = s
        scored.labels[1][n] = 'Principal Component %d (%0.2f%%)' % (n + 1, pca.explained_variance_ratio_[n] * 100.)

    dso_pc = {}
    for n in range(0, weights.shape[1]):
        pcd = DataSet(size=(1, input.shape[1]))
        pcd.entities[1] = input.entities[1]
        pcd.labels[1] = input.labels[1]
        pcd.scales[1] = input.scales[1]
        pcd.data = weights[:, n:n + 1].T
        dso_pc['pc%s' % (n + 1)] = pcd

    return dict(
        list({
            'dso': input,
            'pca': pca,
            'scores': scored,
            #'weights': weights,
            'wmx': wmx,
            'dso_z': dso_z,
        }.items()) + list(dso_pc.items()))
def load_datafile(self, file):
    reader = csv.reader(open(file, 'rU'), delimiter='\t', dialect='excel')
    hrow = next(reader)  # Get top row

    slabels = []
    data = []

    if hrow[0] == 'Profiled Data Type':
        # Is a Chenomx output file; use the other columns to map data scale/etc. once implemented
        next(reader)  # Skip date row
        hrow = next(reader)
        labels = hrow[2:]  # We strip off the pH here; might be nice to keep it
        entities = [self.m.db.synrev[l] if l in self.m.db.synrev else None for l in labels]  # Map to entities if they exist
        next(reader)  # Skip compound ID
        next(reader)  # Skip InChI
        next(reader)  # Skip SMILES

    for hrow in reader:  # Now read the data rows
        slabels.append(hrow[0])
        td = []
        for x in hrow[2:]:
            try:
                td.append(float(x))
            except:
                td.append(0)
        data.append(td)

    data = np.array(data)
    dso = DataSet(size=data.shape)
    print(data.shape)

    dso.labels[1] = labels
    dso.entities[1] = entities
    dso.labels[0] = slabels
    dso.data = data

    return {'output': dso}
def load_bml_datafile(self, data_path, target, name):
    dso = DataSet()

    # Read in data for the graphing metabolite, with associated value (generate mean)
    reader = csv.reader(utils.nonull(open(data_path, 'rb')), delimiter='\t', dialect='excel')

    for row in reader:
        if row and row[0] == 'metabolite':  # Look for the top row
            break
    else:
        return

    samples = row[1:-2]  # Sample identities
    samples = [sample[8:-1] for sample in samples]

    xdim = 0
    ydim = len(samples)

    raw_data = []
    metabolites = []
    for row in reader:
        xdim += 1
        metabolites.append(row[0])
        raw_data.append([float(i) for i in row[1:-2]])

    dso = DataSet(size=(ydim, xdim))
    dso.labels[1] = metabolites
    dso.data = np.array(raw_data).T
    dso.name = name
    dso.description = 'Imported from FIMA (%s)' % name

    return dso
def load_datafile(self, file):
    reader = csv.reader(open(file, 'rU'), delimiter='\t', dialect='excel')
    hrow = next(reader)  # Get top row

    slabels = []
    data = []

    if hrow[0] == 'Profiled Data Type':
        # Is a Chenomx output file; use the other columns to map data scale/etc. once implemented
        next(reader)  # Skip date row
        hrow = next(reader)
        labels = hrow[2:]  # We strip off the pH here; might be nice to keep it
        entities = [self.m.db.synrev[l] if l in self.m.db.synrev else None for l in labels]  # Map to entities if they exist
        next(reader)  # Skip compound ID
        next(reader)  # Skip InChI
        next(reader)  # Skip SMILES

    for hrow in reader:  # Now read the data rows
        slabels.append(hrow[0])
        td = []
        for x in hrow[2:]:
            try:
                td.append(float(x))
            except:
                td.append(0)
        data.append(td)

    data = np.array(data)
    dso = DataSet(size=data.shape)
    print(data.shape)

    dso.labels[1] = labels
    dso.entities[1] = entities
    dso.labels[0] = slabels
    dso.data = data

    return {'output': dso}
def generate(self, input_1=None, input_2=None, input_3=None, input_4=None):
    #dsi = input
    # Iterate all the compounds in the current analysis
    # Assign score to each of the compound's pathways
    # Sum up, crop and return a list of pathway_ids to display
    # Pass this in as the list to view
    # + requested pathways, - excluded pathways
    db = self.m.db

    mining_depth = self.config.get('/Data/MiningDepth')
    mining_type = self.config.get('/Data/MiningType')

    pathway_scores = defaultdict(int)

    for dsi in input_1, input_2, input_3, input_4:
        if dsi is None:
            continue

        print("Mining using '%s'" % mining_type)

        for n, entity in enumerate(dsi.entities[1]):
            if entity is None:
                continue  # Skip

            score = dsi.data[0, n]
            #score = self.analysis[ m_id ]['score']

            # 1' neighbours; 2' neighbours etc. add score
            # Get a list of methods in connected reactions, add their score % to this compound
            # if m_id in db.compounds.keys():
            #     n_compounds = [r.compounds for r in db.compounds[ m_id ].reactions ]
            #     print n_compounds
            #     n_compounds = [m for ml in n_compounds for m in ml if n_m.id in self.analysis and m.id != m_id ]
            #     for n_m in n_compounds:
            #         score += self.analysis[ n_m.id ]['score'] * 0.5

            # Get the entity's pathways
            pathways = entity.pathways
            if pathways == []:
                continue

            if self.config.get('/Data/MiningShared'):
                # Share the change score between the associated pathways
                # this prevents compounds having undue influence
                score = score / len(pathways)

            for p in pathways:
                mining_val = {
                    'c': abs(score),
                    'u': max(0, score),
                    'd': abs(min(0, score)),
                    'm': 1.0,
                    't': score,
                }
                pathway_scores[p] += mining_val[mining_type]

    # If we're using tendency scaling; abs the scores here
    if mining_type == 't':
        for p, v in list(pathway_scores.items()):
            pathway_scores[p] = abs(v)

    # If we're pruning, then remove any pathways not in keep_pathways
    if self.config.get('/Data/MiningRelative'):
        print("Scaling pathway scores to pathway sizes...")
        for p, v in list(pathway_scores.items()):
            pathway_scores[p] = float(v) / len(p.reactions)

    if not pathway_scores:  # No data
        raise BaseException

    # Now take the accumulated scores; and create the output
    pathway_scorest = list(pathway_scores.items())  # Switch it to a list of tuples so we can sort
    pathway_scorest = [(p, v) for p, v in pathway_scorest if v > 0]  # Remove any scores of 0
    pathway_scorest.sort(key=lambda tup: tup[1], reverse=True)  # Sort by scores (either system)

    # Get top N defined by mining_depth parameter
    keep_pathways = pathway_scorest[0:mining_depth]
    remaining_pathways = pathway_scorest[mining_depth + 1:mining_depth + 100]

    print("Mining recommended %d out of %d" % (len(keep_pathways), len(pathway_scores)))
    for n, p in enumerate(keep_pathways):
        print("- %d. %s [%.2f]" % (n + 1, p[0].name, p[1]))

    #self.analysis['mining_ranked_remaining_pathways'] = []
    #if remaining_pathways:
    #    print "Note: Next pathways by current scoring method are..."
    #    for n2,p in enumerate(remaining_pathways):
    #        print "- %d. %s [%.2f]" % (n+n2+1, db.pathways[ p[0] ].name, p[1])
    #        self.analysis['mining_ranked_remaining_pathways'].append( p[0] )

    #self.analysis_suggested_pathways = [db.pathways[p[0]] for p in pathway_scorest]

    dso = DataSet(size=(1, len(keep_pathways)))
    dso.entities[1] = [k for k, v in keep_pathways]
    dso.labels[1] = [k.name for k, v in keep_pathways]
    dso.data = np.array([v for k, v in keep_pathways], ndmin=2)
    dso.labels[0][0] = "Pathway mining scores"

    return {'output': dso}
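# Illustrative sketch (not part of the original module): the per-pathway scoring switch used in
# the mining generate() above, isolated so the mining-type codes are easy to read. The _demo_*
# name is an assumption; the mode definitions are copied from the mining_val dict in the method.
def _demo_mining_score(score, mining_type):
    mining_val = {
        'c': abs(score),           # total (absolute) change
        'u': max(0, score),        # up-regulated change only
        'd': abs(min(0, score)),   # down-regulated change only
        'm': 1.0,                  # simple compound-membership count
        't': score,                # signed tendency (abs() applied after accumulation)
    }
    return mining_val[mining_type]

# e.g. _demo_mining_score(-0.4, 'd') == 0.4, while _demo_mining_score(-0.4, 'u') == 0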
def generate(self, input=None):
    dso = input

    _experiment_test = self.config.get('experiment_test')
    _experiment_control = self.config.get('experiment_control')

    data = dso.data

    plsr = PLSRegression(n_components=self.config.get('number_of_components'),
                         scale=self.config.get('autoscale'))  #, algorithm=self.config.get('algorithm'))
    Y = np.array([0 if c == _experiment_control else 1 for c in dso.classes[0]])
    plsr.fit(data, Y)  # Transpose it, as vars need to be along the top

    # Build scores into a dso: no_of_samples x no_of_principal_components
    scored = DataSet(size=(len(plsr.x_scores_), len(plsr.x_scores_[0])))
    scored.labels[0] = input.labels[0]
    scored.classes[0] = input.classes[0]

    for n, s in enumerate(plsr.x_scores_.T):
        scored.data[:, n] = s
        scored.labels[1][n] = 'Latent Variable %d' % (n + 1)  #, plsr.y_weights_[0][n])

    # PLS-DA regions; mean +- 95% confidence in each axis for each cluster
    cw_x = defaultdict(list)
    cw_y = defaultdict(list)
    figure_regions = []

    for c in list(cw_x.keys()):
        # Calculate mean point
        cx = np.mean(cw_x[c])
        cy = np.mean(cw_y[c])

        # Calculate 95% CI
        rx = np.std(cw_x[c]) * 2  # 2sd = 95% #1.95 * ( / srn) # 1.95 * SEM => 95% confidence
        ry = np.std(cw_y[c]) * 2  #1.95 * ( / srn)

        figure_regions.append((c, cx, cy, rx, ry))

    # Label up the top 50 (the values are retained; just for clarity)
    wmx = np.amax(np.absolute(plsr.x_weights_), axis=1)
    dso_z = list(zip(dso.scales[1], dso.entities[1], dso.labels[1]))
    dso_z = sorted(zip(dso_z, wmx), key=lambda x: x[1])[-50:]  # Top 50
    dso_z = [x for x, wmx in dso_z]

    weightsd = DataSet(size=plsr.x_weights_.T.shape)
    weightsd.data = plsr.x_weights_.T
    weightsd.scales[1] = input.scales[1]

    dso_lv = {}
    for n in range(0, plsr.x_weights_.shape[1]):
        lvd = DataSet(size=(1, input.shape[1]))
        lvd.entities[1] = input.entities[1]
        lvd.labels[1] = input.labels[1]
        lvd.scales[1] = input.scales[1]
        lvd.data = plsr.x_weights_[:, n:n + 1].T
        dso_lv['lv%s' % (n + 1)] = lvd

        weightsd.labels[0][n] = "Weights on LV %s" % (n + 1)
        weightsd.classes[0][n] = "LV %s" % (n + 1)

    return dict(list({
        'dso': dso,
        'scores': scored,
        'weights': weightsd,
        #'figure_data': figure_data,
        #'figure_regions': figure_regions,
        'y_weights': plsr.y_weights_,
        'x_weights': plsr.x_weights_,
    }.items()) + list(dso_lv.items()))
def load_csv_C(self, filename):
    # Load from csv with experiments in COLUMNS, metabolites in ROWS
    # Read in data for the graphing metabolite, with associated value (generate mean)
    f = open(filename, 'rU')
    fsize = os.path.getsize(filename)
    reader = csv.reader(f, delimiter=str(','), dialect='excel')

    hrow = next(reader)  # Discard top row (sample no's)
    samples = hrow[1:]

    hrow = next(reader)  # Get 2nd row
    classesa = hrow[1:]
    classes = [c for c in classesa if c != '.']

    metabolites = []
    data = []
    added_rows = 0

    for n, row in enumerate(reader):
        metabolite = row[0]
        metabolites.append(row[0])
        quants = []
        for cn, c in enumerate(row[1:]):
            if classesa[cn] != '.':
                try:
                    data.append(float(c))
                except:
                    data.append(0)

        if n % 100 == 0:
            self.progress.emit(float(f.tell()) / fsize)

    data = np.asarray(data)
    data = np.reshape(data, (n + 1, len(classes))).T

    xdim = len(metabolites)
    ydim = len(classes)

    # Build dataset object
    dso = DataSet(size=(xdim, ydim))  # self.add_data('imported_data', DataSetself) )
    dso.empty(size=(ydim, xdim))
    dso.labels[1] = metabolites

    scales = []
    mlabels = []
    for m in metabolites:
        try:
            scales.append(float(m))
            mlabels.append(None)
        except:
            scales.append(None)
            mlabels.append(m)

    dso.scales[0] = [None] * len(samples)
    dso.labels[0] = samples
    dso.classes[0] = classes
    dso.entities[0] = [None] * len(samples)

    dso.scales[1] = scales
    dso.labels[1] = mlabels
    dso.classes[1] = [None] * len(scales)
    dso.entities[1] = [None] * len(scales)

    dso.data = data

    return dso
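# Illustrative sketch (not part of the original module): how the CSV loaders above split column
# headers into numeric scales vs. text labels — numeric headers (e.g. ppm values) become scale
# entries, anything else becomes a label. The _demo_* name is an assumption for demonstration.
def _demo_split_scales_and_labels(headers):
    scales, mlabels = [], []
    for m in headers:
        try:
            scales.append(float(m))
            mlabels.append(None)
        except ValueError:
            scales.append(None)
            mlabels.append(m)
    return scales, mlabels

# e.g. _demo_split_scales_and_labels(['3.44', 'Glucose']) -> ([3.44, None], [None, 'Glucose'])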
def load_csv_R(self, filename):
    # Load from csv with experiments in ROWS, metabolites in COLUMNS
    # Read in data for the graphing metabolite, with associated value (generate mean)
    f = open(filename, 'rU')
    fsize = os.path.getsize(filename)
    reader = csv.reader(f, delimiter=str(','), dialect='excel')
    print('R')

    hrow = next(reader)  # Get top row
    metabolites = hrow[2:]

    ydim = 0
    xdim = len(metabolites)

    samples = []
    classes = []
    raw_data = []

    # Build quants table for metabolite classes
    #for metabolite in self.metabolites:
    #    quantities[ metabolite ] = defaultdict(list)

    for n, row in enumerate(reader):
        ydim += 1
        if row[1] != '.':  # Skip excluded classes; row[1] = Class
            samples.append(row[0])
            classes.append(row[1])

            data_row = []
            for c in row[2:]:  # in self.metabolites:
                try:
                    c = float(c)
                except:
                    c = 0
                data_row.append(c)

            raw_data.append(data_row)

            #metabolite_column = hrow.index( metabolite )
            #if row[ metabolite_column ]:
            #    quantities[metabolite][ row[1] ].append( float(row[ metabolite_column ]) )
            #    self.statistics['ymin'] = min( self.statistics['ymin'], float(row[ metabolite_column ]) )
            #    self.statistics['ymax'] = max( self.statistics['ymax'], float(row[ metabolite_column ]) )
            #else:
            #    quantities[metabolite][ row[1] ].append( 0 )
        else:
            pass
            #self.statistics['excluded'] += 1

        if n % 100 == 0:
            self.progress.emit(float(f.tell()) / fsize)

    # Build dataset object
    dso = DataSet(size=(xdim, ydim))  # self.add_data('imported_data', DataSetself) )
    dso.empty(size=(ydim, xdim))
    #dso.labels[1] = metabolites

    scales = []
    mlabels = []
    for m in metabolites:
        try:
            scales.append(float(m))
            mlabels.append(None)
        except:
            scales.append(None)
            mlabels.append(m)

    dso.scales[0] = [None] * len(samples)
    dso.labels[0] = samples
    dso.classes[0] = classes
    dso.entities[0] = [None] * len(samples)

    dso.scales[1] = scales
    dso.labels[1] = mlabels
    dso.entities[1] = [None] * len(scales)
    dso.classes[1] = [None] * len(scales)

    dso.data = np.array(raw_data)

    return dso
def generate(self, input=None):
    dso = input

    _experiment_test = self.config.get('experiment_test')
    _experiment_control = self.config.get('experiment_control')

    data = dso.data

    plsr = PLSRegression(n_components=self.config.get('number_of_components'),
                         scale=self.config.get('autoscale'))  #, algorithm=self.config.get('algorithm'))
    Y = np.array([0 if c == _experiment_control else 1 for c in dso.classes[0]])
    #Y = Y.reshape( (len(dso.classes[0]),1) )
    plsr.fit(data, Y)  # Transpose it, as vars need to be along the top

    #figure_data = zip( dso.classes[0], plsr.x_scores_[:,0], plsr.x_scores_[:,1])

    # Build scores into a dso: no_of_samples x no_of_principal_components
    scored = DataSet(size=(len(plsr.x_scores_), len(plsr.x_scores_[0])))
    scored.labels[0] = input.labels[0]
    scored.classes[0] = input.classes[0]

    print(plsr.x_scores_.shape)
    print(scored.data.shape)

    for n, s in enumerate(plsr.x_scores_.T):
        scored.data[:, n] = s
        scored.labels[1][n] = 'Latent Variable %d (%0.2f%%)' % (n + 1, plsr.y_weights_[0][0] * 100)

    # PLS-DA regions; mean +- 95% confidence in each axis for each cluster
    cw_x = defaultdict(list)
    cw_y = defaultdict(list)
    figure_regions = []
    #for c,x,y in figure_data:
    #    cw_x[c].append( x )
    #    cw_y[c].append( y )

    for c in list(cw_x.keys()):
        # Calculate mean point
        cx = np.mean(cw_x[c])
        cy = np.mean(cw_y[c])

        # Calculate 95% CI
        rx = np.std(cw_x[c]) * 2  # 2sd = 95% #1.95 * ( / srn) # 1.95 * SEM => 95% confidence
        ry = np.std(cw_y[c]) * 2  #1.95 * ( / srn)

        # Calculate 95% CI
        #srn = np.sqrt( len( cw_x[c] ) )  # Sample numbers sqrt
        #rx = 1.95*(np.std( cw_x[c] )/srn )  # 2sd = 95% #1.95 * ( / srn) # 1.95 * SEM => 95% confidence
        #ry = 1.95*(np.std( cw_y[c] )/srn )  #1.95 * ( / srn)

        figure_regions.append((c, cx, cy, rx, ry))

    # Label up the top 50 (the values are retained; just for clarity)
    wmx = np.amax(np.absolute(plsr.x_weights_), axis=1)
    dso_z = list(zip(dso.scales[1], dso.entities[1], dso.labels[1]))
    dso_z = sorted(zip(dso_z, wmx), key=lambda x: x[1])[-50:]  # Top 50
    dso_z = [x for x, wmx in dso_z]

    dso_lv = {}
    for n in range(0, plsr.x_weights_.shape[1]):
        lvd = DataSet(size=(1, input.shape[1]))
        lvd.entities[1] = input.entities[1]
        lvd.labels[1] = input.labels[1]
        lvd.scales[1] = input.scales[1]
        lvd.data = plsr.x_weights_[:, n:n + 1].T
        dso_lv['lv%s' % (n + 1)] = lvd

    return dict(
        list({
            'dso': dso,
            'scores': scored,
            #'figure_data': figure_data,
            #'figure_regions': figure_regions,
            'y_weights': plsr.y_weights_,
            'x_weights': plsr.x_weights_,
        }.items()) + list(dso_lv.items()))
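# Illustrative sketch (not part of the original module): a minimal PLS-DA setup mirroring the
# calls above — fit scikit-learn's PLSRegression against a binary class vector and inspect the
# score/weight shapes the DataSet wrapping relies on. The _demo_* name and the random toy data
# are assumptions for demonstration only; recent scikit-learn versions expose the sample scores
# via transform() rather than the x_scores_ attribute used above.
def _demo_plsda_shapes():
    import numpy as np
    from sklearn.cross_decomposition import PLSRegression

    X = np.random.rand(12, 30)                       # 12 samples x 30 variables
    classes = ['control'] * 6 + ['test'] * 6
    Y = np.array([0 if c == 'control' else 1 for c in classes])

    plsr = PLSRegression(n_components=2, scale=True).fit(X, Y)
    scores = plsr.transform(X)                       # (12, 2): one score per sample per latent variable
    weights = plsr.x_weights_                        # (30, 2): one weight per variable per latent variable
    return scores.shape, weights.shape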
def generate(self, input_1=None, input_2=None, input_3=None, input_4=None):
    #dsi = input
    # Iterate all the compounds in the current analysis
    # Assign score to each of the compound's pathways
    # Sum up, crop and return a list of pathway_ids to display
    # Pass this in as the list to view
    # + requested pathways, - excluded pathways
    db = self.m.db

    mining_depth = self.config.get('/Data/MiningDepth')
    mining_type = self.config.get('/Data/MiningType')

    pathway_scores = defaultdict(int)

    for dsi in input_1, input_2, input_3, input_4:
        if dsi is None:
            continue

        print("Mining using '%s'" % mining_type)

        for n, entity in enumerate(dsi.entities[1]):
            if entity is None:
                continue  # Skip

            score = dsi.data[0, n]
            #score = self.analysis[ m_id ]['score']

            # 1' neighbours; 2' neighbours etc. add score
            # Get a list of methods in connected reactions, add their score % to this compound
            # if m_id in db.compounds.keys():
            #     n_compounds = [r.compounds for r in db.compounds[ m_id ].reactions ]
            #     print n_compounds
            #     n_compounds = [m for ml in n_compounds for m in ml if n_m.id in self.analysis and m.id != m_id ]
            #     for n_m in n_compounds:
            #         score += self.analysis[ n_m.id ]['score'] * 0.5

            # Get the entity's pathways
            pathways = entity.pathways
            if pathways == []:
                continue

            if self.config.get('/Data/MiningShared'):
                # Share the change score between the associated pathways
                # this prevents compounds having undue influence
                score = score / len(pathways)

            for p in pathways:
                mining_val = {
                    'c': abs(score),
                    'u': max(0, score),
                    'd': abs(min(0, score)),
                    'm': 1.0,
                    't': score,
                }
                pathway_scores[p] += mining_val[mining_type]

    # If we're using tendency scaling; abs the scores here
    if mining_type == 't':
        for p, v in list(pathway_scores.items()):
            pathway_scores[p] = abs(v)

    # If we're pruning, then remove any pathways not in keep_pathways
    if self.config.get('/Data/MiningRelative'):
        print("Scaling pathway scores to pathway sizes...")
        for p, v in list(pathway_scores.items()):
            pathway_scores[p] = float(v) / len(p.reactions)

    if not pathway_scores:  # No data
        raise BaseException

    # Now take the accumulated scores; and create the output
    pathway_scorest = list(pathway_scores.items())  # Switch it to a list of tuples so we can sort
    pathway_scorest = [(p, v) for p, v in pathway_scorest if v > 0]  # Remove any scores of 0
    pathway_scorest.sort(key=lambda tup: tup[1], reverse=True)  # Sort by scores (either system)

    # Get top N defined by mining_depth parameter
    keep_pathways = pathway_scorest[0:mining_depth]
    remaining_pathways = pathway_scorest[mining_depth + 1:mining_depth + 100]

    print("Mining recommended %d out of %d" % (len(keep_pathways), len(pathway_scores)))
    for n, p in enumerate(keep_pathways):
        print("- %d. %s [%.2f]" % (n + 1, p[0].name, p[1]))

    #self.analysis['mining_ranked_remaining_pathways'] = []
    #if remaining_pathways:
    #    print "Note: Next pathways by current scoring method are..."
    #    for n2,p in enumerate(remaining_pathways):
    #        print "- %d. %s [%.2f]" % (n+n2+1, db.pathways[ p[0] ].name, p[1])
    #        self.analysis['mining_ranked_remaining_pathways'].append( p[0] )

    #self.analysis_suggested_pathways = [db.pathways[p[0]] for p in pathway_scorest]

    dso = DataSet(size=(1, len(keep_pathways)))
    dso.entities[1] = [k for k, v in keep_pathways]
    dso.labels[1] = [k.name for k, v in keep_pathways]
    dso.data = np.array([v for k, v in keep_pathways], ndmin=2)
    dso.labels[0][0] = "Pathway mining scores"

    return {'output': dso}
def load_csv_C(self, filename):
    # Load from csv with experiments in COLUMNS, metabolites in ROWS
    # Read in data for the graphing metabolite, with associated value (generate mean)
    f = open(filename, 'rU')
    fsize = os.path.getsize(filename)
    reader = csv.reader(f, delimiter=str(','), dialect='excel')

    hrow = next(reader)  # Discard top row (sample no's)
    samples = hrow[1:]

    hrow = next(reader)  # Get 2nd row
    classesa = hrow[1:]
    classes = [c for c in classesa if c != '.']

    metabolites = []
    data = []
    added_rows = 0

    for n, row in enumerate(reader):
        metabolite = row[0]
        metabolites.append(row[0])
        quants = []
        for cn, c in enumerate(row[1:]):
            if classesa[cn] != '.':
                try:
                    data.append(float(c))
                except:
                    data.append(0)

        if n % 100 == 0:
            self.progress.emit(float(f.tell()) / fsize)

    data = np.asarray(data)
    data = np.reshape(data, (n + 1, len(classes))).T

    xdim = len(metabolites)
    ydim = len(classes)

    # Build dataset object
    dso = DataSet(size=(xdim, ydim))  # self.add_data('imported_data', DataSetself) )
    dso.empty(size=(ydim, xdim))
    dso.labels[1] = metabolites

    scales = []
    mlabels = []
    for m in metabolites:
        try:
            scales.append(float(m))
            mlabels.append(None)
        except:
            scales.append(None)
            mlabels.append(m)

    dso.scales[0] = [None] * len(samples)
    dso.labels[0] = samples
    dso.classes[0] = classes
    dso.entities[0] = [None] * len(samples)

    dso.scales[1] = scales
    dso.labels[1] = mlabels
    dso.classes[1] = [None] * len(scales)
    dso.entities[1] = [None] * len(scales)

    dso.data = data

    return dso
def load_csv_R(self, filename):
    # Load from csv with experiments in ROWS, metabolites in COLUMNS
    # Read in data for the graphing metabolite, with associated value (generate mean)
    f = open(filename, 'rU')
    fsize = os.path.getsize(filename)
    reader = csv.reader(f, delimiter=str(','), dialect='excel')
    print('R')

    hrow = next(reader)  # Get top row
    metabolites = hrow[2:]

    ydim = 0
    xdim = len(metabolites)

    samples = []
    classes = []
    raw_data = []

    # Build quants table for metabolite classes
    #for metabolite in self.metabolites:
    #    quantities[ metabolite ] = defaultdict(list)

    for n, row in enumerate(reader):
        ydim += 1
        if row[1] != '.':  # Skip excluded classes; row[1] = Class
            samples.append(row[0])
            classes.append(row[1])

            data_row = []
            for c in row[2:]:  # in self.metabolites:
                try:
                    c = float(c)
                except:
                    c = 0
                data_row.append(c)

            raw_data.append(data_row)

            #metabolite_column = hrow.index( metabolite )
            #if row[ metabolite_column ]:
            #    quantities[metabolite][ row[1] ].append( float(row[ metabolite_column ]) )
            #    self.statistics['ymin'] = min( self.statistics['ymin'], float(row[ metabolite_column ]) )
            #    self.statistics['ymax'] = max( self.statistics['ymax'], float(row[ metabolite_column ]) )
            #else:
            #    quantities[metabolite][ row[1] ].append( 0 )
        else:
            pass
            #self.statistics['excluded'] += 1

        if n % 100 == 0:
            self.progress.emit(float(f.tell()) / fsize)

    # Build dataset object
    dso = DataSet(size=(xdim, ydim))  # self.add_data('imported_data', DataSetself) )
    dso.empty(size=(ydim, xdim))
    #dso.labels[1] = metabolites

    scales = []
    mlabels = []
    for m in metabolites:
        try:
            scales.append(float(m))
            mlabels.append(None)
        except:
            scales.append(None)
            mlabels.append(m)

    dso.scales[0] = [None] * len(samples)
    dso.labels[0] = samples
    dso.classes[0] = classes
    dso.entities[0] = [None] * len(samples)

    dso.scales[1] = scales
    dso.labels[1] = mlabels
    dso.entities[1] = [None] * len(scales)
    dso.classes[1] = [None] * len(scales)

    dso.data = np.array(raw_data)

    return dso