def doWork(options):
    """Run a PCA over a delimited table and write scaled first-component scores.

    Every input line is split on ``options.sep``; the first field is kept as
    the row name and the remaining fields are parsed as floats. The first
    principal-component score of each row is rescaled to [0, 1] and written
    next to the row name, either as the number itself or (when
    ``options.colors`` is set) as a ``#RRGGBB`` string obtained by using the
    scaled score as an HSV hue with saturation and value fixed at 1.

    Attributes read from ``options``:
        infile, outfile -- paths of the input and output files
        sep             -- field separator used for reading and writing
        header          -- if true, the first input line is skipped
        max_rows        -- stop after this many rows (0 means no limit)
        colors          -- write hex colours instead of raw scaled scores
        plot            -- if true, scatter PC0 against PC1 and show it

    Returns 0.
    Raises ValueError if the input contains no data rows.
    """
    names = []
    values = []          # flat list of every numeric field, row after row
    num_cols = 0
    max_rows = int(options.max_rows)

    # convert the input csv into a flat list of floats plus the row names
    print("loading data...")
    with open(options.infile, 'r') as in_csv:
        if options.header:
            # the default stops an empty file from raising StopIteration
            next(in_csv, None)
        for line in in_csv:
            row = line.rstrip().split(options.sep)
            if not names:
                # column count comes from the first row, minus the name field
                num_cols = len(row) - 1
            names.append(row[0])
            values.extend(float(x) for x in row[1:])
            if max_rows != 0 and len(names) >= max_rows:
                break

    num_rows = len(names)
    if num_rows == 0:
        raise ValueError("No data rows found in %s" % options.infile)
    print("Loaded %s rows across %s cols" % (num_rows, num_cols))

    # do the PCA and extract the scores
    data_array = np.reshape(np.array(values), (num_rows, num_cols))
    print("Performing PCA ...")
    Center(data_array, verbose=0)
    components = PCA(data_array).pc()

    # scale PC0 into [0, 1]; a zero range would otherwise divide by zero
    pc0 = [float(x) for x in components[:, 0]]
    min_score = min(pc0)
    score_range = max(pc0) - min_score
    if score_range > 0:
        scaled_scores = [(x - min_score) / score_range for x in pc0]
    else:
        scaled_scores = [0.0] * len(pc0)

    # write to file
    S = 1
    V = 1
    with open(options.outfile, 'w') as out_csv:
        out_csv.write(options.sep.join(["'name'", "'value'"]) + "\n")
        for name, score in zip(names, scaled_scores):
            if options.colors:
                rgb = [int(c * 255) for c in htr(score, S, V)]
                value = "#%s" % "".join("%0.2X" % k for k in rgb)
            else:
                value = score
            out_csv.write(options.sep.join([str(name), str(value)]) + "\n")

    # plot the PCA if we've been asked to...
    if options.plot:
        figure()
        plot(components[:, 0], components[:, 1], '*g')
        axis('equal')
        show()

    return 0
def createColorMapHSV(self):
    """Build the 1000-entry 'GC' colormap from a sine-eased sweep of HSV hues.

    Saturation and value are both fixed at 1.0; only the hue varies.
    """
    saturation = 1.0
    value = 1.0
    rgb_stops = []
    for step in xrange(0, 1000):
        # half a sine period maps step 0 to hue 0 and eases towards hue 1
        hue = (1.0 + np_sin(np_pi * (step / 1000.0) - np_pi / 2)) / 2.
        rgb_stops.append(htr(hue, saturation, value))
    return LinearSegmentedColormap.from_list('GC', rgb_stops, N=1000)
def getColor(self, vector):
    """Return a colour (list of three 0-255 ints) for a given weight vector.

    The hue is the angle between the normalised vector and ``self.cVec``
    divided by ``self.maxAngle``; the saturation is the vector's norm divided
    by ``self.largestMag``. A zero-length vector gets hue 0 and saturation 0.
    """
    magnitude = np.linalg.norm(vector)
    hue = 0.0
    saturation = 0.0
    if magnitude > 0:
        unit = vector / magnitude
        hue = self.getAngBetweenNormed(unit, self.cVec) / self.maxAngle
        saturation = magnitude / self.largestMag
    # VAL stays fixed at 1. Reduce to make pastels if that's your preference...
    value = 1
    return [int(channel * 255) for channel in htr(hue, saturation, value)]
def getColor(self, vector):
    """Return a colour for a given weight vector as [R, G, B] ints in 0-255.

    Direction (angle to ``self.cVec`` relative to ``self.maxAngle``) picks the
    hue and length (relative to ``self.largestMag``) picks the saturation.
    """
    length = np.linalg.norm(vector)
    if length > 0:
        direction = vector / length
        angle = self.getAngBetweenNormed(direction, self.cVec)
        # VAL (third entry) stays at 1; lower it for pastel colours
        hsv = (angle / self.maxAngle, length / self.largestMag, 1)
    else:
        # a zero vector has no direction: hue 0, saturation 0
        hsv = (0.0, 0.0, 1)

    col = []
    for component in htr(*hsv):
        col.append(int(component * 255))
    return col
def randomizeCols(self, setLoaded=False):
    """Choose colors randomly.

    One evenly spaced HSV hue (saturation and value fixed at 1.0) is
    generated per entry of ``self.bin2Str``, and the resulting colours are
    shuffled with ``np.random.shuffle``. Bins flagged in ``self.loaded`` get
    the colour at their position in the shuffled list; all other bins get
    ``self.unbinnedCol``. Results are stored in ``self.bin2Cols``.

    setLoaded -- if true, every bin in ``self.bin2Str`` is first marked as
                 loaded, so every bin receives a random colour.

    Does nothing (beyond the optional flagging) when there are no bins.
    """
    S = 1.0
    V = 1.0
    if setLoaded:
        for bid in self.bin2Str:
            self.loaded[bid] = True

    num_bins = len(self.bin2Str)
    if num_bins == 0:
        # nothing to colour; the step below would also divide by zero
        return

    # place the hues at the centres of num_bins equal slices of [0, 1]
    offset = 0.5
    step = 1. / (num_bins - 1 + 2. * offset)
    cols = [htr(step * (i + offset), S, V) for i in range(num_bins)]
    np.random.shuffle(cols)

    for i, bid in enumerate(self.bin2Str):
        if self.loaded[bid]:
            # assign the color we picked
            self.bin2Cols[bid] = cols[i]
        else:
            # set to the unbinned color
            self.bin2Cols[bid] = self.unbinnedCol
def doWork(args):
    """Main wrapper: print an ESOM class file that colours contigs by GM bin.

    ``args.names`` is a tab-separated file whose second column is a chunk
    name; the contig name is recovered by dropping the last '_'-separated
    field (the last two when one of the fields is 'leftover').
    ``args.gmbin`` is a tab-separated file of ``<bin id>\\t<contig name>``.

    Printed to stdout, in order:
      * the number of lines in the names file, followed by '%'
      * a white NOCLASS row for class 0
      * one row per distinct bin id with an RGB colour (evenly spaced hues)
      * one ``<line number>\\t<bin id>`` row per names-file line; contigs
        absent from the GM file are given bin 0

    Any exception raised while reading either file is reported together with
    the offending file's path and then re-raised.
    """
    esom_name_to_ids = []
    all_named_contigs = {}

    # parse names file and make the hash we'll be needing
    try:
        with open(args.names, "r") as fh:
            for line in fh:
                parts = line.rstrip().split("\t")
                cid_parts = parts[1].split('_')
                # strip the trailing field(s) to recover the contig name
                trim = 2 if 'leftover' in cid_parts else 1
                base_name = "_".join(cid_parts[:len(cid_parts) - trim])
                # ('hydrocarbon_scaffold_82282', ['1', 'hydrocarbon_scaffold_82282_0'])
                esom_name_to_ids.append((base_name, parts))
                all_named_contigs[base_name] = True
    except Exception:
        print("Error opening file: %s %s" % (args.names, sys.exc_info()[0]))
        raise
    num_esom_names = len(esom_name_to_ids)

    # parse the GM file
    gm_bin_ids = {}
    cid_2_gmbin = {}
    try:
        with open(args.gmbin, "r") as fh:
            for line in fh:
                parts = line.rstrip().split("\t")
                bid = int(parts[0])
                cid_2_gmbin[parts[1]] = bid
                gm_bin_ids[bid] = True
    except Exception:
        # report the GM file here, not the names file
        print("Error opening file: %s %s" % (args.gmbin, sys.exc_info()[0]))
        raise
    num_gm_bins = len(gm_bin_ids)

    # produce a whole heap of colors
    S = 1  # SAT and VAL remain fixed at 1. Reduce to make
    V = 1  # pastels if that's your preference...
    bin_cols = [
        [int(channel * 255) for channel in htr(float(i) / num_gm_bins, S, V)]
        for i in range(num_gm_bins)
    ]

    # work out all the bin 0 contigs
    for cid in all_named_contigs:
        cid_2_gmbin.setdefault(cid, 0)

    # build the class file
    print("%d" % num_esom_names + "%")
    print("0%\tNOCLASS\t255\t255\t255")
    # NOTE(review): class rows are numbered 1..N by position while contig rows
    # below carry the raw GM bin id; these agree only if bin ids are 1..N.
    for i, rgb in enumerate(bin_cols):
        print("%d" % (i + 1) + "%" +
              "\t%d\t%d\t%d\t%d" % (i + 1, rgb[0], rgb[1], rgb[2]))
    for i, (base_name, _parts) in enumerate(esom_name_to_ids):
        print("%d\t%d" % (i + 1, cid_2_gmbin[base_name]))
def plotRegion(self, px, py, pz, fileName="", tag="", column=False, show=True):
    """Plot the region surrounding a point.

    Every point that is neither binned nor restricted and lies within
    ``self.span`` of (px, py, pz) is drawn in its contig colour. Points
    within half that span are added a second time in black to mark the
    focus.

    px, py, pz -- coordinates of the focus point
    fileName   -- if non-empty, the figure is saved there (6x6 inches,
                  300 dpi) instead of being displayed
    tag        -- extra text appended to the plot title
    column     -- if true, the z range of the coloured points becomes
                  0 .. scaleFactor - 1 instead of the span around pz
    show       -- when no fileName is given, display the figure
                  interactively (default True)
    """
    import matplotlib as mpl

    def gather(x_range, y_range, z_range, colour_for):
        """Collect coordinates and colours of free points inside a box."""
        vals = []
        cols = []
        (x_lower, x_upper) = x_range
        (y_lower, y_upper) = y_range
        (z_lower, z_upper) = z_range
        for z in range(z_lower, z_upper):
            # the z axis is stored flipped relative to the loop variable
            realz = self.PM.scaleFactor - z - 1
            for x in range(x_lower, x_upper):
                for y in range(y_lower, y_upper):
                    if (x, y, realz) not in self.im2RowIndicies:
                        continue
                    for row_index in self.im2RowIndicies[(x, y, realz)]:
                        if (row_index not in self.PM.binnedRowIndicies and
                                row_index not in self.PM.restrictedRowIndicies):
                            vals.append(self.PM.transformedCP[row_index])
                            cols.append(colour_for(row_index))
        return vals, cols

    # plot all points within span
    z_range = self.makeCoordRanges(pz, self.span)
    if column:
        z_range = (0, self.PM.scaleFactor - 1)
    x_range = self.makeCoordRanges(px, self.span)
    y_range = self.makeCoordRanges(py, self.span)
    vals, cols = gather(x_range, y_range, z_range,
                        lambda row_index: self.PM.contigColours[row_index])

    # make a black mark at the max values
    small_span = self.span / 2
    x_range = self.makeCoordRanges(px, small_span)
    y_range = self.makeCoordRanges(py, small_span)
    z_range = self.makeCoordRanges(pz, small_span)
    mark_vals, mark_cols = gather(x_range, y_range, z_range,
                                  lambda row_index: htr(0, 0, 0))
    vals.extend(mark_vals)
    cols.extend(mark_cols)

    # one conversion at the end instead of growing arrays point by point
    num_points = len(vals)
    disp_vals = np_reshape(np_array(vals, dtype=float), (num_points, 3))
    disp_cols = np_reshape(np_array(cols, dtype=float), (num_points, 3))

    fig = plt.figure()
    ax = fig.add_subplot(111, projection="3d")
    cm = mpl.colors.LinearSegmentedColormap("my_colormap", disp_cols, 1024)
    ax.scatter(
        disp_vals[:, 0], disp_vals[:, 1], disp_vals[:, 2],
        edgecolors=disp_cols, c=disp_cols,
        cmap=cm, marker="."
    )
    title = " ".join(["Focus at: (", str(px), str(py),
                      str(self.PM.scaleFactor - pz - 1), ")\n", tag])
    plt.title(title)

    if fileName != "":
        fig.set_size_inches(6, 6)
        plt.savefig(fileName, dpi=300)
    elif show:
        plt.show()

    plt.close(fig)
def doWork(args):
    """Main wrapper: write an ESOM class file for GM bins to stdout.

    Reads two tab-separated files:
      * ``args.names`` -- the second column is a chunk name; dropping its
        last '_' field (last two if a field equals 'leftover') gives the
        contig name.
      * ``args.gmbin`` -- ``<bin id>\\t<contig name>`` pairs.

    Output: a count header, a white NOCLASS entry, one coloured entry per
    distinct bin id, then ``<line number>\\t<bin id>`` for every line of the
    names file (bin 0 for contigs not listed in the GM file).

    A failure while reading a file prints that file's path plus the
    exception type and re-raises the exception.
    """
    # parse names file and make the hash we'll be needing
    base_names = []           # contig name for each names-file line, in order
    try:
        with open(args.names, "r") as fh:
            for line in fh:
                fields = line.rstrip().split("\t")
                cid_parts = fields[1].split('_')
                if 'leftover' in cid_parts:
                    keep = len(cid_parts) - 2
                else:
                    keep = len(cid_parts) - 1
                # e.g. 'hydrocarbon_scaffold_82282_0' -> 'hydrocarbon_scaffold_82282'
                base_names.append("_".join(cid_parts[:keep]))
    except Exception:
        print("Error opening file: %s %s" % (args.names, sys.exc_info()[0]))
        raise

    # parse the GM file
    gm_bin_ids = set()
    cid_2_gmbin = {}
    try:
        with open(args.gmbin, "r") as fh:
            for line in fh:
                fields = line.rstrip().split("\t")
                bid = int(fields[0])
                cid_2_gmbin[fields[1]] = bid
                gm_bin_ids.add(bid)
    except Exception:
        # name the file that actually failed (the GM file)
        print("Error opening file: %s %s" % (args.gmbin, sys.exc_info()[0]))
        raise

    # produce a whole heap of colors: evenly spaced hues, SAT and VAL fixed
    # at 1 (reduce them to make pastels if that's your preference)
    num_gm_bins = len(gm_bin_ids)
    S = 1
    V = 1
    bin_cols = []
    for i in range(num_gm_bins):
        rgb = htr(float(i) / num_gm_bins, S, V)
        bin_cols.append([int(channel * 255) for channel in rgb])

    # build the class file
    print("%d" % len(base_names) + "%")
    print("0%\tNOCLASS\t255\t255\t255")
    # NOTE(review): classes are numbered by position (1..N) but the contig
    # rows use the GM bin id itself; confirm the ids are always 1..N.
    for class_id, (red, green, blue) in enumerate(bin_cols, 1):
        print("%d" % class_id + "%" +
              "\t%d\t%d\t%d\t%d" % (class_id, red, green, blue))
    for row_number, base_name in enumerate(base_names, 1):
        # contigs missing from the GM file belong to bin 0
        print("%d\t%d" % (row_number, cid_2_gmbin.get(base_name, 0)))
def createColorMapHSV(self):
    """Return the 'GC' colormap: 1000 fully saturated colours over eased hues."""
    S = 1.0
    V = 1.0

    def eased_hue(val):
        # rescale a sine from [-1, 1] to [0, 1]; val 0 gives hue 0
        return (1.0 + np_sin(np_pi * (val / 1000.0) - np_pi / 2)) / 2.

    colours = [htr(eased_hue(val), S, V) for val in xrange(0, 1000)]
    return LinearSegmentedColormap.from_list('GC', colours, N=1000)
def loadData(self,
             condition="",            # condition as set by another function
             bids=None,               # if this is set then only load those contigs with these bin ids
             verbose=True,            # many to some output messages
             silent=False,            # some to no output messages
             loadCovProfiles=True,
             loadKmerSigs=True,
             makeColours=True,
             loadContigNames=True,
             loadContigLengths=True,
             loadBins=False,
             loadLinks=False):
    """Load pre-parsed data.

    Selects the rows of ``self.dbFileName`` that match ``condition`` (or,
    when ``bids`` is given, the rows whose bin id is any of ``bids``) and
    then fills in the attributes requested by the ``load*`` / ``makeColours``
    flags using ``self.dataManager``.

    condition -- selection expression handed to the data manager
    bids      -- bin ids to restrict to; overrides ``condition`` when non-empty
    verbose   -- print a progress line for every loading step
    silent    -- print nothing at all (implies ``verbose=False``)
    load* / makeColours -- choose which pieces of data are loaded

    When ``loadBins`` is false, ``self.binIds`` is set to all zeros.
    Any exception is reported with the database file name (even when
    ``silent`` is set) and re-raised.
    """
    if bids is None:
        bids = []
    # apply the silent override before anything gets printed
    if silent:
        verbose = False
    if verbose:
        print("Loading data from: %s" % self.dbFileName)

    # check to see if we need to override the condition
    if len(bids) != 0:
        condition = "(" + " | ".join(
            "(bid == " + str(bid) + ")" for bid in bids) + ")"

    try:
        self.numStoits = self.getNumStoits()
        self.condition = condition
        if verbose:
            print(" Loading indices ( %s )" % condition)
        self.indices = self.dataManager.getConditionalIndicies(
            self.dbFileName, condition=condition)
        self.numContigs = len(self.indices)

        if not silent:
            print(" Working with: %d contigs" % self.numContigs)

        if loadCovProfiles:
            if verbose:
                print(" Loading coverage profiles")
            self.covProfiles = self.dataManager.getCoverageProfiles(
                self.dbFileName, indices=self.indices)
            # work out average coverages
            self.averageCoverages = np_array(
                [sum(i) / self.numStoits for i in self.covProfiles])

        if loadKmerSigs:
            if verbose:
                print(" Loading kmer sigs")
            self.kmerSigs = self.dataManager.getKmerSigs(
                self.dbFileName, indices=self.indices)

            if makeColours:
                if verbose:
                    print(" Creating colour profiles")
                self.makeColourProfile()
                # use HSV to RGB to generate colours
                S = 1  # SAT and VAL remain fixed at 1. Reduce to make
                V = 1  # pastels if that's your preference...
                self.contigColours = np_array(
                    [htr(val, S, V) for val in self.kmerVals])

        if loadContigNames:
            if verbose:
                print(" Loading contig names")
            self.contigNames = self.dataManager.getContigNames(
                self.dbFileName, indices=self.indices)

        if loadContigLengths:
            if verbose:
                print(" Loading contig lengths")
            self.contigLengths = self.dataManager.getContigLengths(
                self.dbFileName, indices=self.indices)
            if not silent:
                print(" Contigs contain %d BP" % (sum(self.contigLengths)))

        if loadBins:
            if verbose:
                print(" Loading bins")
            self.binIds = self.dataManager.getBins(
                self.dbFileName, indices=self.indices)
            if len(bids) != 0:
                # need to make sure we're not restricted in terms of bins
                # NOTE(review): validBinIds is updated in place here, so it
                # must already exist; confirm it is initialised elsewhere.
                tmp_bids = self.getBinStats()
                for bid in bids:
                    self.validBinIds[bid] = tmp_bids[bid]
            else:
                self.validBinIds = self.getBinStats()

            # fix the binned indices
            self.binnedRowIndicies = {}
            for i in range(len(self.indices)):
                if self.binIds[i] != 0:
                    self.binnedRowIndicies[i] = True
        else:
            # we need zeros as bin indicies then...
            self.binIds = np_zeros(len(self.indices))

        if loadLinks:
            self.loadLinks()

    except Exception:
        print("Error loading DB: %s %s" % (self.dbFileName, exc_info()[0]))
        raise