def doWork( options ):
    # open the input and output files
    outCSV = open(options.outfile, 'w')
    inCSV = open(options.infile, 'r')
    if(options.header):
        next(inCSV)
    
    data_array = np.array([])
    names_array = np.array([])
    
    # convert the input csv into an array
    print "loading data..."
    num_rows = 0
    num_cols = 0
    max_rows = int(options.max_rows)
    for line in inCSV:
        row = line.rstrip().split(options.sep)
        if(0 == num_rows):
            num_cols = len(row)
        num_rows += 1  
        names_array = np.append(names_array, [row[0]])
        data_array = np.append(data_array, [float(x) for x in row[1:]])
        if((max_rows != 0) and (num_rows >= max_rows)):
            break
    
    # adjust for the row names
    num_cols -= 1
    print "Loaded" ,num_rows, "rows across" , num_cols, "cols"

    # do the PCA and extract the scores    
    data_array = np.reshape(data_array, (num_rows, num_cols))
    print "Performing PCA ..." ,
    Center(data_array,verbose=0)
    p = PCA(data_array)
    components = p.pc()
    
    # scale PC0
    min_score = float(min(components[:,0]))
    max_score = float(max(components[:,0]))-min_score
    scaled_scores = [(float(x)-min_score)/max_score for x in components[:,0]]
    
    # write to file
    outCSV.write(options.sep.join(["'name'","'value'"])+"\n")
    for index in range (0,num_rows):
        S = 1
        V = 1
        if(options.colors):
            col = "#%s" % "".join(["%0.2X" % k for k in [int(i*255) for i in htr(scaled_scores[index], S, V)]])
            outCSV.write(options.sep.join([str(x) for x in [names_array[index], col]])+"\n")
        else:
            outCSV.write(options.sep.join([str(x) for x in [names_array[index], scaled_scores[index]]])+"\n")
        
    # plot the PCA if we've been asked to...
    if(options.plot):
        figure()
        plot(components[:,0],components[:,1],'*g')
        axis('equal')
        show()   
    
    return 0
Example #2
0
 def createColorMapHSV(self):
     """Return a 1000-entry colour map named 'GC'.

     The hue of entry ``val`` follows half a sine wave, starting at 0 for
     ``val == 0`` and rising towards 1; saturation and value stay at 1.0.
     Each (hue, S, V) triple is converted with ``htr`` (presumably
     colorsys.hsv_to_rgb -- TODO confirm against the file's imports).
     """
     saturation = 1.0
     value = 1.0
     colours = []
     for val in xrange(0, 1000):
         # map val onto -pi/2 .. pi/2 so the sine sweeps from -1 up to 1
         phase = np_pi * (val / 1000.0) - np_pi / 2
         hue = (1.0 + np_sin(phase)) / 2.
         colours.append(htr(hue, saturation, value))
     return LinearSegmentedColormap.from_list('GC', colours, N=1000)
Example #3
0
 def getColor(self, vector):
     """Return a colour, as a list of ints, for a given weight vector.

     The angle between the normalised vector and ``self.cVec`` (as a
     fraction of ``self.maxAngle``) is passed to ``htr`` as its first
     argument, and the vector's norm (as a fraction of ``self.largestMag``)
     as its second. A vector whose norm is not positive uses 0.0 for both.
     """
     magnitude = np.linalg.norm(vector)
     angle_fraction = 0.0
     magnitude_fraction = 0.0
     if magnitude > 0:
         unit_vector = vector / magnitude
         angle_fraction = self.getAngBetweenNormed(unit_vector, self.cVec)/self.maxAngle
         magnitude_fraction = magnitude / self.largestMag
     # VAL remains fixed at 1. Reduce to make pastels if that's your preference...
     rgb = htr(angle_fraction, magnitude_fraction, 1)
     return [int(channel*255) for channel in rgb]
Example #4
0
 def getColor(self, vector):
     """Map a weight vector onto a colour (list of ints scaled by 255).

     The first ``htr`` argument is the angle to ``self.cVec`` divided by
     ``self.maxAngle``; the second is the vector norm divided by
     ``self.largestMag``. Both fall back to 0.0 when the norm is not
     positive.
     """
     norm = np.linalg.norm(vector)
     if norm > 0:
         ang_perc, mag_perc = (
             self.getAngBetweenNormed(vector / norm, self.cVec) / self.maxAngle,
             norm / self.largestMag,
         )
     else:
         ang_perc, mag_perc = 0.0, 0.0
     # third argument stays fixed at 1; lower it for pastel colours
     components = htr(ang_perc, mag_perc, 1)
     colour = []
     for component in components:
         colour.append(int(component * 255))
     return colour
Example #5
0
 def randomizeCols(self, setLoaded=False):
     """Choose colours randomly for every bin in ``self.bin2Str``.

     One hue per bin is spread evenly across the hue range (with half a
     step of padding at each end), converted with ``htr`` (presumably
     colorsys.hsv_to_rgb -- TODO confirm) and shuffled. Bins flagged in
     ``self.loaded`` receive one of these colours in ``self.bin2Cols``;
     all other bins receive ``self.unbinnedCol``.

     setLoaded -- when true, every bin is first marked as loaded.

     Does nothing further when there are no bins.
     """
     S = 1.0
     V = 1.0
     if setLoaded:
         for bid in self.bin2Str:
             self.loaded[bid] = True
     num_bins = len(self.bin2Str)
     if num_bins == 0:
         # no bins to colour; the step below would otherwise divide by zero
         return
     offset = 0.5
     step = 1. / (num_bins - 1 + 2. * offset)
     Hs = np.array([step * (i + offset) for i in range(num_bins)])
     cols = [htr(H, S, V) for H in Hs]
     np.random.shuffle(cols)
     # colours are consumed by position, so unloaded bins skip their slot
     for i, bid in enumerate(self.bin2Str):
         if self.loaded[bid]:
             # assign the color we picked
             self.bin2Cols[bid] = cols[i]
         else:
             # set to the unbinned color
             self.bin2Cols[bid] = self.unbinnedCol
Example #6
0
def doWork(args):
    """Print an ESOM class file built from a names file and a bin file.

    ``args.names`` is a tab-separated file whose second column holds contig
    chunk names; the trailing ``_<n>`` field (or the trailing two fields
    when one of them is 'leftover') is stripped to get the contig id.
    ``args.gmbin`` is a tab-separated file with an integer bin id in the
    first column and a contig id in the second.

    Written to stdout, in order: the number of names followed by '%', a
    NOCLASS line, one colour line per distinct bin id, then one
    "<row number>\\t<bin id>" line per names-file row. Contigs missing from
    the bin file are given bin 0.

    Any exception raised while reading either file is reported together
    with that file's path and then re-raised.
    """
    # parse names file and make the hash we'll be needing
    esom_name_to_ids = []
    all_named_contigs = {}
    try:
        with open(args.names, "r") as fh:
            for line in fh:
                parts = line.rstrip().split("\t")
                cid_parts = parts[1].split('_')
                num_suffix_fields = 2 if 'leftover' in cid_parts else 1
                base_name = "_".join(cid_parts[:len(cid_parts) - num_suffix_fields])
                # ('hydrocarbon_scaffold_82282', ['1', 'hydrocarbon_scaffold_82282_0'])
                esom_name_to_ids.append((base_name, parts))
                all_named_contigs[base_name] = True
    except:
        # report and re-raise; nothing is swallowed here
        print("Error opening file: %s %s" % (args.names, sys.exc_info()[0]))
        raise
    num_esom_names = len(esom_name_to_ids)

    # parse the GM file
    gm_bin_ids = {}
    cid_2_gmbin = {}
    try:
        with open(args.gmbin, "r") as fh:
            for line in fh:
                parts = line.rstrip().split("\t")
                bid = int(parts[0])
                cid_2_gmbin[parts[1]] = bid
                gm_bin_ids[bid] = True
    except:
        # name the file that actually failed (the bin file, not the names file)
        print("Error opening file: %s %s" % (args.gmbin, sys.exc_info()[0]))
        raise

    num_gm_bins = len(gm_bin_ids)

    # produce a whole heap of colors, one evenly spaced hue per bin
    S = 1  # SAT and VAL remain fixed at 1. Reduce to make
    V = 1  # Pastels if that's your preference...
    # htr is presumably colorsys.hsv_to_rgb -- TODO confirm against the file's imports
    bin_cols = [
        [int(channel * 255) for channel in htr(float(i) / num_gm_bins, S, V)]
        for i in range(num_gm_bins)
    ]

    # work out all the bin 0 contigs
    for cid in all_named_contigs:
        cid_2_gmbin.setdefault(cid, 0)

    # build the class file
    print("%d%%" % num_esom_names)
    print("0%\tNOCLASS\t255\t255\t255")
    for class_id, (red, green, blue) in enumerate(bin_cols, 1):
        print("%d%%\t%d\t%d\t%d\t%d" % (class_id, class_id, red, green, blue))

    for row_number, (base_name, _parts) in enumerate(esom_name_to_ids, 1):
        print("%d\t%d" % (row_number, cid_2_gmbin[base_name]))
Example #7
0
    def plotRegion(self, px, py, pz, fileName="", tag="", column=False):
        """Plot the unbinned, unrestricted points surrounding (px, py, pz).

        Points within ``self.span`` of the focus are drawn with their own
        entry from ``self.PM.contigColours``; points within half that span
        are added a second time using the colour ``htr(0, 0, 0)``.

        px, py, pz -- focus coordinates
        fileName   -- when not empty, the figure is saved to this path
        tag        -- extra text placed on the second line of the title
        column     -- when true, the first pass covers z from 0 up to
                      ``self.PM.scaleFactor - 1`` instead of the span around pz
        """
        import matplotlib as mpl

        def unclaimed_rows(x_bounds, y_bounds, z_bounds):
            """Yield row indices inside the bounds that are neither binned nor restricted."""
            for z in range(z_bounds[0], z_bounds[1]):
                # the lookup key uses the z coordinate flipped within the scale factor
                realz = self.PM.scaleFactor - z - 1
                for x in range(x_bounds[0], x_bounds[1]):
                    for y in range(y_bounds[0], y_bounds[1]):
                        cell = (x, y, realz)
                        if cell not in self.im2RowIndicies:
                            continue
                        for row_index in self.im2RowIndicies[cell]:
                            if row_index in self.PM.binnedRowIndicies:
                                continue
                            if row_index in self.PM.restrictedRowIndicies:
                                continue
                            yield row_index

        points = np_array([])
        colours = np_array([])
        num_points = 0

        # plot all points within span
        (z_lo, z_hi) = self.makeCoordRanges(pz, self.span)
        if column:
            z_lo = 0
            z_hi = self.PM.scaleFactor - 1
        (x_lo, x_hi) = self.makeCoordRanges(px, self.span)
        (y_lo, y_hi) = self.makeCoordRanges(py, self.span)
        for row_index in unclaimed_rows((x_lo, x_hi), (y_lo, y_hi), (z_lo, z_hi)):
            num_points += 1
            points = np_append(points, self.PM.transformedCP[row_index])
            colours = np_append(colours, self.PM.contigColours[row_index])

        # make a black mark at the max values
        small_span = self.span / 2
        (x_lo, x_hi) = self.makeCoordRanges(px, small_span)
        (y_lo, y_hi) = self.makeCoordRanges(py, small_span)
        (z_lo, z_hi) = self.makeCoordRanges(pz, small_span)
        for row_index in unclaimed_rows((x_lo, x_hi), (y_lo, y_hi), (z_lo, z_hi)):
            num_points += 1
            points = np_append(points, self.PM.transformedCP[row_index])
            colours = np_append(colours, htr(0, 0, 0))

        # np_append flattens its input, so restore one row of three per point
        points = np_reshape(points, (num_points, 3))
        colours = np_reshape(colours, (num_points, 3))

        fig = plt.figure()
        ax = fig.add_subplot(111, projection="3d")
        cm = mpl.colors.LinearSegmentedColormap("my_colormap", colours, 1024)
        ax.scatter(
            points[:, 0],
            points[:, 1],
            points[:, 2],
            edgecolors=colours,
            c=colours,
            cmap=cm,
            marker=".",
        )
        title_parts = ["Focus at: (", str(px), str(py), str(self.PM.scaleFactor - pz - 1), ")\n", tag]
        plt.title(" ".join(title_parts))

        if fileName != "":
            fig.set_size_inches(6, 6)
            plt.savefig(fileName, dpi=300)
        # NOTE(review): `show` is not bound anywhere in this method, so it is
        # looked up in an enclosing scope -- confirm what it refers to
        elif show:
            plt.show()

        plt.close(fig)
        del fig
Example #8
0
def doWork( args ):
    """Main wrapper: print an ESOM class file for a set of binned contigs.

    Reads ``args.names`` (tab-separated; the second column is a contig chunk
    name) and ``args.gmbin`` (tab-separated; integer bin id, then contig id).
    The contig id of a chunk is its name minus the last '_'-separated field,
    or minus the last two fields when one of the fields is 'leftover'.

    Output on stdout: "<number of names>%", a NOCLASS line, one
    "<class>%\\t<class>\\t<R>\\t<G>\\t<B>" line per distinct bin id, and one
    "<row number>\\t<bin id>" line per row of the names file. Contigs that do
    not appear in the bin file are assigned bin 0.

    Exceptions raised while reading a file are reported with the path of
    that file and then re-raised.
    """
    esom_name_to_ids = []
    num_esom_names = 0
    all_named_contigs = {}
    # parse names file and make the hash we'll be needing
    try:
        with open(args.names, "r") as fh:
            for line in fh:
                num_esom_names += 1
                parts = line.rstrip().split("\t")
                cid_parts = parts[1].split('_')
                if 'leftover' in cid_parts:
                    keep = len(cid_parts) - 2
                else:
                    keep = len(cid_parts) - 1
                base_name = "_".join(cid_parts[:keep])
                # ('hydrocarbon_scaffold_82282', ['1', 'hydrocarbon_scaffold_82282_0'])
                esom_name_to_ids.append((base_name, parts))
                all_named_contigs[base_name] = True
    except:
        # report and re-raise; nothing is swallowed here
        print("Error opening file: %s %s" % (args.names, sys.exc_info()[0]))
        raise

    # parse the GM file
    gm_bin_ids = {}
    cid_2_gmbin = {}
    try:
        with open(args.gmbin, "r") as fh:
            for line in fh:
                parts = line.rstrip().split("\t")
                bid = int(parts[0])
                cid_2_gmbin[parts[1]] = bid
                gm_bin_ids[bid] = True
    except:
        # this handler guards the bin file, so that is the path to report
        print("Error opening file: %s %s" % (args.gmbin, sys.exc_info()[0]))
        raise

    num_gm_bins = len(gm_bin_ids)

    # produce a whole heap of colors
    color_steps = [float(i)/num_gm_bins for i in range(num_gm_bins)]
    S = 1       # SAT and VAL remain fixed at 1. Reduce to make
    V = 1       # Pastels if that's your preference...
    # htr is presumably colorsys.hsv_to_rgb -- TODO confirm against the file's imports
    bin_cols = [[int(c*255) for c in htr(val, S, V)] for val in color_steps]

    # work out all the bin 0 contigs
    for cid in all_named_contigs:
        if cid not in cid_2_gmbin:
            cid_2_gmbin[cid] = 0

    # build the class file
    print("%d%%" % num_esom_names)
    print("0%\tNOCLASS\t255\t255\t255")
    for i, col in enumerate(bin_cols):
        print("%d%%\t%d\t%d\t%d\t%d" % (i+1, i+1, col[0], col[1], col[2]))

    for i, name_and_parts in enumerate(esom_name_to_ids):
        # ('hydrocarbon_scaffold_82282', ['1', 'hydrocarbon_scaffold_82282_0'])
        gm_bid = cid_2_gmbin[name_and_parts[0]]
        print("%d\t%d" % (i+1, gm_bid))
Example #9
0
 def createColorMapHSV(self):
     """Build the 'GC' colour map from 1000 colours of varying hue.

     Saturation and value are fixed at 1.0. The hue for step ``val`` is
     ``(1 + sin(pi * val/1000 - pi/2)) / 2``, which is 0 at the first step
     and approaches 1 at the last. ``htr`` (presumably colorsys.hsv_to_rgb
     -- TODO confirm) turns each triple into a colour.
     """
     def hue_for(val):
         return (1.0 + np_sin(np_pi * (val/1000.0) - np_pi/2))/2.

     sat = 1.0
     brightness = 1.0
     palette = [htr(hue_for(val), sat, brightness) for val in xrange(0, 1000)]
     return LinearSegmentedColormap.from_list('GC', palette, N=1000)
Example #10
0
    def loadData(self,
                 condition="",              # condition as set by another function
                 bids=[],                   # if this is set then only load those contigs with these bin ids (never mutated here)
                 verbose=True,              # many to some output messages
                 silent=False,              # some to no output messages
                 loadCovProfiles=True,
                 loadKmerSigs=True,
                 makeColours=True,
                 loadContigNames=True,
                 loadContigLengths=True,
                 loadBins=False,
                 loadLinks=False):
        """Load pre-parsed data from ``self.dbFileName`` onto this object.

        The rows to load are chosen by ``condition``. A non-empty ``bids``
        replaces it with "((bid == a) | (bid == b) ...)". Each ``load*``
        flag enables one ``self.dataManager`` query, whose result is stored
        on ``self``. Colours are only made when ``loadKmerSigs`` is also set.
        When ``loadBins`` is false, ``self.binIds`` is filled with zeros.

        Output levels: ``verbose`` prints progress messages, the default
        prints only the contig and base-pair totals, and ``silent`` prints
        nothing and overrides ``verbose``.

        Any exception is reported together with the DB file name (even when
        silent) and then re-raised.
        """
        # silent wins over verbose; decide this before the first message so
        # that silent=True really suppresses everything
        if(silent):
            verbose=False
        if(verbose):
            print("Loading data from: %s" % (self.dbFileName,))

        # check to see if we need to override the condition
        if(len(bids) != 0):
            condition = "(" + " | ".join(["(bid == "+str(bid)+")" for bid in bids]) + ")"
        try:
            self.numStoits = self.getNumStoits()
            self.condition = condition
            if(verbose):
                print("    Loading indices ( %s )" % (condition,))
            self.indices = self.dataManager.getConditionalIndicies(self.dbFileName, condition=condition)
            self.numContigs = len(self.indices)

            if(not silent):
                print("    Working with: %d contigs" % self.numContigs)

            if(loadCovProfiles):
                if(verbose):
                    print("    Loading coverage profiles")
                self.covProfiles = self.dataManager.getCoverageProfiles(self.dbFileName, indices=self.indices)

                # work out average coverages
                self.averageCoverages = np_array([sum(i)/self.numStoits for i in self.covProfiles])

            if(loadKmerSigs):
                if(verbose):
                    print("    Loading kmer sigs")
                self.kmerSigs = self.dataManager.getKmerSigs(self.dbFileName, indices=self.indices)

                if(makeColours):
                    if(verbose):
                        print("    Creating colour profiles")
                    self.makeColourProfile()
                    # use HSV to RGB to generate colours
                    S = 1       # SAT and VAL remain fixed at 1. Reduce to make
                    V = 1       # Pastels if that's your preference...
                    self.contigColours = np_array([htr(val, S, V) for val in self.kmerVals])

            if(loadContigNames):
                if(verbose):
                    print("    Loading contig names")
                self.contigNames = self.dataManager.getContigNames(self.dbFileName, indices=self.indices)

            if(loadContigLengths):
                if(verbose):
                    print("    Loading contig lengths")
                self.contigLengths = self.dataManager.getContigLengths(self.dbFileName, indices=self.indices)
                if(not silent):
                    print("    Contigs contain %d BP" % ( sum(self.contigLengths) ))

            if(loadBins):
                if(verbose):
                    print("    Loading bins")
                self.binIds = self.dataManager.getBins(self.dbFileName, indices=self.indices)
                if len(bids) != 0: # need to make sure we're not restricted in terms of bins
                    # NOTE(review): relies on self.validBinIds already existing -- confirm it is initialised elsewhere
                    tmp_bids = self.getBinStats()
                    for bid in bids:
                        self.validBinIds[bid] = tmp_bids[bid]
                else:
                    self.validBinIds = self.getBinStats()

                # fix the binned indices
                self.binnedRowIndicies = {}
                for i in range(len(self.indices)):
                    if(self.binIds[i] != 0):
                        self.binnedRowIndicies[i] = True
            else:
                # we need zeros as bin indicies then...
                self.binIds = np_zeros(len(self.indices))

            if(loadLinks):
                self.loadLinks()

        except:
            # report and re-raise; nothing is swallowed here
            print("Error loading DB: %s %s" % (self.dbFileName, exc_info()[0]))
            raise