Esempio n. 1
0
    def expandSelection(self, startIndex, vals, stdevCutoff=0.05, maxSpread=0.1):
        """Expand a selection left and right from a staring index in a list of values
        
        Keep expanding unless the stdev of the values goes above the cutoff
        Return a list of indices into the original list
        """
        ret_list = [startIndex]  # this is what we will give back
        start_val = vals[startIndex]
        value_store = [start_val]

        sorted_indices = np_argsort(vals)
        max_index = len(vals)

        # set the upper and lower to point to the position
        # where the start resides
        lower_index = 0
        upper_index = 0
        for i in range(max_index):
            if sorted_indices[i] == startIndex:
                break
            lower_index += 1
            upper_index += 1
        do_lower = True
        do_upper = True
        max_index -= 1

        while do_lower or do_upper:
            if do_lower:
                do_lower = False
                if lower_index > 0:
                    try_val = vals[sorted_indices[lower_index - 1]]
                    if np_abs(try_val - start_val) < maxSpread:
                        try_array = value_store + [try_val]
                        if np_std(try_array) < stdevCutoff:
                            value_store = try_array
                            lower_index -= 1
                            ret_list.append(sorted_indices[lower_index])
                            do_lower = True
            if do_upper:
                do_upper = False
                if upper_index < max_index:
                    try_val = vals[sorted_indices[upper_index + 1]]
                    if np_abs(try_val - start_val) < maxSpread:
                        try_array = value_store + [try_val]
                        if np_std(try_array) < stdevCutoff:
                            value_store = try_array
                            upper_index += 1
                            ret_list.append(sorted_indices[upper_index])
                            do_upper = True
        return sorted(ret_list)
Esempio n. 2
0
    def get_filtered(self, condition, sort_by_col=None):
        """
        Filter and optionally sort asset metrics by condition
        :param condition: boolean array (example: (mf['some_metric'] > 0) & (mf['another_metric'] == 1) )
        :param sort_by_col: (optional) column name to sort results (sort order is always ascending)
        :return: tuple of arrays ( sorted_assets_array, sorted_metrics_data_matrix)
        """
        flt_idx = self._indexes[condition]
        _flt_data = self._data.take(flt_idx, axis=0)
        if sort_by_col is not None:
            col_id = self._columns[sort_by_col]

            srt_idx = np_argsort(_flt_data[:, col_id])
            orig_sorted_idx = np_take(flt_idx, srt_idx, axis=0)
            return np_take(self._assets_list,
                           orig_sorted_idx), np_take(_flt_data,
                                                     srt_idx,
                                                     axis=0)
        else:
            return np_take(self._assets_list, flt_idx), self._data[flt_idx, :]
Esempio n. 3
0
    def findArrayCenter(self, vals):
        """Find the center of the numpy array vals, return the index of the center"""
        # parameters
        current_val_max = -1
        delta = 0
        bounce_amount = 0.1
        height = 0
        last_val = 0

        working = np_array([])
        final_index = -1

        # sort and normalise between 0 -> 1
        sorted_indices = np_argsort(vals)
        vals_sorted = [vals[i] for i in sorted_indices]
        vals_sorted -= vals_sorted[0]
        if vals_sorted[-1] != 0:
            vals_sorted /= vals_sorted[-1]

        # print vals_sorted

        # run through in one direction
        for val in vals_sorted:
            # calculate delta
            delta = val - last_val
            # reduce the current value according to the delta value
            height = self.reduceViaDelta(height, bounce_amount, delta)
            # bounce the ball up
            height += bounce_amount

            # store the height
            working = np_append(working, height)
            final_index += 1

            # save the last val
            last_val = val

        current_val_max = -1
        height = 0
        last_val = 0

        # print "===W==="
        # print working
        # print "===E==="

        # run through in the reverse direction
        vals_sorted = vals_sorted[::-1]
        for val in vals_sorted:
            if last_val == 0:
                delta = 0
            else:
                delta = last_val - val
            height = self.reduceViaDelta(height, bounce_amount, delta)
            height += bounce_amount
            # add to the old heights
            working[final_index] += height
            final_index -= 1
            last_val = val

        # print working
        # print "==EEE=="

        # find the original index!
        return sorted_indices[np_argmax(working)]
Esempio n. 4
0
def find_max_neig(neig_list,g1,perc,model,scaler,inputs):

    n_maxs = len(neig_list)
    if n_maxs == 0:
        return None    
    if n_maxs > 10:
        # Ascending order         
        n_maxs = int(np_ceil(perc*len(neig_list)))    
    
    neig_key_list = [k for k in neig_list]
    neig_wt_list = [float(neig_list[k]['weight']) for k in neig_list]
    sorted_ind = np_argsort(neig_wt_list)
    sorted_wts = [{'weight':val} for val in np_sort(neig_wt_list)][-n_maxs:]
    sorted_neig_keys = [neig_key_list[i] for i in sorted_ind][-n_maxs:]
    imp_neigs = dict(zip(sorted_neig_keys,sorted_wts))
    
    folNm = inputs['folNm']
    
    if len(imp_neigs) == 1:
        imp_neig = list(imp_neigs.keys())[0]
        wt = imp_neigs[imp_neig]
        wt_edge = wt['weight']          
        node_to_add = imp_neig
        #ADD ALL EDGES OF NEW NODE TO ORIG GRAPH
        with open(folNm+"/"+node_to_add,'rb') as f:
            its_neig_list = pickle_load(f)      
                    
        orig_nodes = g1.nodes()
        all_nodesWedges = set(orig_nodes).intersection(its_neig_list)
        
        for node in all_nodesWedges:
            wt = its_neig_list[node]
            wt_edge = wt['weight']
            g1.add_edge(node_to_add,node,weight=wt_edge)        
        (score_imp_neig,comp_bool) = get_score(g1,model,scaler,inputs['model_type'])
        g1.remove_node(node_to_add)
    else:
        scores = {}
        for neig in imp_neigs:
            # Add to graph 
            wt = imp_neigs[neig]
            wt_edge = wt['weight']          
            node_to_add = neig
            #ADD ALL EDGES OF NEW NODE TO ORIG GRAPH
            with open(folNm+"/"+node_to_add,'rb') as f:
                its_neig_list = pickle_load(f)                              
            orig_nodes = g1.nodes()
            all_nodesWedges = set(orig_nodes).intersection(its_neig_list)
            
            for node in all_nodesWedges:
                wt = its_neig_list[node]
                wt_edge = wt['weight']
                g1.add_edge(node_to_add,node,weight=wt_edge)
            # Check score
            (score_curr,comp_bool) = get_score(g1,model,scaler,inputs['model_type'])         
            scores[neig] = score_curr
            g1.remove_node(node_to_add)
            
            
        imp_neig = max(iter(scores.items()), key=operator_itemgetter(1))[0]  
        score_imp_neig = scores[imp_neig]

    return(imp_neig,score_imp_neig)
Esempio n. 5
0
    def shuffleBAMs(self):
        """Make the data transformation deterministic by reordering the bams"""
        # first we should make a subset of the total data
        # we'd like to take it down to about 1500 or so RI's
        # but we'd like to do this in a repeatable way
        ideal_contig_num = 1500
        sub_cons = range(len(self.indices))
        while len(sub_cons) > ideal_contig_num:
            # select every second contig when sorted by norm cov
            cov_sorted = np_argsort(self.normCoverages[sub_cons])
            sub_cons = np_array([
                sub_cons[cov_sorted[i * 2]]
                for i in np_arange(int(len(sub_cons) / 2))
            ])

            if len(sub_cons) > ideal_contig_num:
                # select every second contig when sorted by mer PC1
                mer_sorted = np_argsort(self.kmerNormPC1[sub_cons])
                sub_cons = np_array([
                    sub_cons[mer_sorted[i * 2]]
                    for i in np_arange(int(len(sub_cons) / 2))
                ])

        # now that we have a subset, calculate the distance between each of the untransformed vectors
        num_sc = len(sub_cons)

        # log shift the coverages towards the origin
        sub_covs = np_transpose([
            self.covProfiles[i] *
            (np_log10(self.normCoverages[i]) / self.normCoverages[i])
            for i in sub_cons
        ])
        sq_dists = cdist(sub_covs, sub_covs, 'cityblock')
        dists = squareform(sq_dists)

        # initialise a list of left, right neighbours
        lr_dict = {}
        for i in range(self.numStoits):
            lr_dict[i] = []

        too_big = 10000
        while True:
            closest = np_argmin(dists)
            if dists[closest] == too_big:
                break
            (i, j) = self.small2indices(closest, self.numStoits - 1)
            lr_dict[j].append(i)
            lr_dict[i].append(j)

            # mark these guys as neighbours
            if len(lr_dict[i]) == 2:
                # no more than 2 neighbours
                sq_dists[i, :] = too_big
                sq_dists[:, i] = too_big
                sq_dists[i, i] = 0.0
            if len(lr_dict[j]) == 2:
                # no more than 2 neighbours
                sq_dists[j, :] = too_big
                sq_dists[:, j] = too_big
                sq_dists[j, j] = 0.0

            # fix the dist matrix
            sq_dists[j, i] = too_big
            sq_dists[i, j] = too_big
            dists = squareform(sq_dists)

        # now make the ordering
        ordering = [0, lr_dict[0][0]]
        done = 2
        while done < self.numStoits:
            last = ordering[done - 1]
            if lr_dict[last][0] == ordering[done - 2]:
                ordering.append(lr_dict[last][1])
                last = lr_dict[last][1]
            else:
                ordering.append(lr_dict[last][0])
                last = lr_dict[last][0]
            done += 1

        # reshuffle the contig order!
        # yay for bubble sort!
        working = np_arange(self.numStoits)
        for i in range(1, self.numStoits):
            # where is this guy in the list
            loc = list(working).index(ordering[i])
            if loc != i:
                # swap the columns
                self.covProfiles[:, [i, loc]] = self.covProfiles[:, [loc, i]]
                self.stoitColNames[[i, loc]] = self.stoitColNames[[loc, i]]
                working[[i, loc]] = working[[loc, i]]
Esempio n. 6
0
    def shuffleBAMs(self):
        """Make the data transformation deterministic by reordering the bams"""
        # first we should make a subset of the total data
        # we'd like to take it down to about 1500 or so RI's
        # but we'd like to do this in a repeatable way
        ideal_contig_num = 1500
        sub_cons = range(len(self.indices))
        while len(sub_cons) > ideal_contig_num:
            # select every second contig when sorted by norm cov
            cov_sorted = np_argsort(self.normCoverages[sub_cons])
            sub_cons = np_array([sub_cons[cov_sorted[i*2]] for i in np_arange(int(len(sub_cons)/2))])

            if len(sub_cons) > ideal_contig_num:
                # select every second contig when sorted by mer PC1
                mer_sorted = np_argsort(self.kmerNormPC1[sub_cons])
                sub_cons = np_array([sub_cons[mer_sorted[i*2]] for i in np_arange(int(len(sub_cons)/2))])

        # now that we have a subset, calculate the distance between each of the untransformed vectors
        num_sc = len(sub_cons)

        # log shift the coverages towards the origin
        sub_covs = np_transpose([self.covProfiles[i]*(np_log10(self.normCoverages[i])/self.normCoverages[i]) for i in sub_cons])
        sq_dists = cdist(sub_covs,sub_covs,'cityblock')
        dists = squareform(sq_dists)

        # initialise a list of left, right neighbours
        lr_dict = {}
        for i in range(self.numStoits):
            lr_dict[i] = []

        too_big = 10000
        while True:
            closest = np_argmin(dists)
            if dists[closest] == too_big:
                break
            (i,j) = self.small2indices(closest, self.numStoits-1)
            lr_dict[j].append(i)
            lr_dict[i].append(j)

            # mark these guys as neighbours
            if len(lr_dict[i]) == 2:
                # no more than 2 neighbours
                sq_dists[i,:] = too_big
                sq_dists[:,i] = too_big
                sq_dists[i,i] = 0.0
            if len(lr_dict[j]) == 2:
                # no more than 2 neighbours
                sq_dists[j,:] = too_big
                sq_dists[:,j] = too_big
                sq_dists[j,j] = 0.0

            # fix the dist matrix
            sq_dists[j,i] = too_big
            sq_dists[i,j] = too_big
            dists = squareform(sq_dists)

        # now make the ordering
        ordering = [0, lr_dict[0][0]]
        done = 2
        while done < self.numStoits:
            last = ordering[done-1]
            if lr_dict[last][0] == ordering[done-2]:
                ordering.append(lr_dict[last][1])
                last = lr_dict[last][1]
            else:
                ordering.append(lr_dict[last][0])
                last = lr_dict[last][0]
            done+=1

        # reshuffle the contig order!
        # yay for bubble sort!
        working = np_arange(self.numStoits)
        for i in range(1, self.numStoits):
            # where is this guy in the list
            loc = list(working).index(ordering[i])
            if loc != i:
                # swap the columns
                self.covProfiles[:,[i,loc]] = self.covProfiles[:,[loc,i]]
                self.stoitColNames[[i,loc]] = self.stoitColNames[[loc,i]]
                working[[i,loc]] = working[[loc,i]]