def merge_columns(self, groups, grouping_criterion):
    """ Returns a new table (an instance of self.__class__) in which
    columns are merged according to a given criterion.

    'groups' argument must be a dictionary in which keys are the new
    column names, and each value is the list of current column names
    to be merged. A current column may take part in one group only.

    'grouping_criterion' must be 'min', 'max' or 'mean', and defines
    how numeric values will be merged. Columns being merged are cast
    to float first.

    In the returned table the merged columns come first (following the
    iteration order of 'groups'), then every column that was not part
    of any group, in its current order. Row names are taken from this
    table.

    Raises ValueError when the criterion is not supported, when a
    listed column does not exist, when a column is listed more than
    once, or when a group lists no columns at all.

    Example:
      my_groups = {'NewColumn':['column5', 'column6']}
      new_Array = Array.merge_columns(my_groups, 'max')

    """
    if grouping_criterion == "max":
        grouping_f = get_max_vector
    elif grouping_criterion == "min":
        grouping_f = get_min_vector
    elif grouping_criterion == "mean":
        grouping_f = get_mean_vector
    else:
        raise ValueError(
            "grouping_criterion not supported. Use max|min|mean ")

    grouped_array = self.__class__()
    grouped_matrix = []
    colNames = []
    # Every source column consumed by a group; used both to detect
    # duplicates and to know which columns must be copied untouched.
    alltnames = set()

    for gname, tnames in groups.items():
        all_vectors = []
        for tn in tnames:
            if tn not in self.colValues:
                raise ValueError(str(tn) + " column not found.")
            if tn in alltnames:
                raise ValueError(
                    str(tn) + " duplicated column name for merging")
            alltnames.add(tn)
            vector = self.get_column_vector(tn).astype(float)
            all_vectors.append(vector)

        # A group without columns cannot produce a vector of the right
        # length; fail with an explicit message instead of letting the
        # reduction (or the final matrix construction) blow up.
        if not all_vectors:
            raise ValueError(str(gname) + " group has no columns to merge.")

        # Store the group vector = min/max/mean of all columns in group
        grouped_matrix.append(grouping_f(all_vectors))
        # Store the group name
        colNames.append(gname)

    # Columns that do not belong to any group are kept as they are
    for cname in self.colNames:
        if cname not in alltnames:
            grouped_matrix.append(self.get_column_vector(cname))
            colNames.append(cname)

    # NOTE(review): the row name list object is shared with this table,
    # not copied -- confirm that is intended.
    grouped_array.rowNames = self.rowNames
    grouped_array.colNames = colNames
    # grouped_matrix holds one entry per column, so transpose it to get
    # the rows x columns layout.
    vmatrix = numpy.array(grouped_matrix).transpose()
    grouped_array._link_names2matrix(vmatrix)
    return grouped_array
def merge_columns(self, groups, grouping_criterion):
    """ Returns a new table (an instance of self.__class__) in which
    columns are merged according to a given criterion.

    'groups' argument must be a dictionary in which keys are the new
    column names, and each value is the list of current column names
    to be merged. A current column may take part in one group only.

    'grouping_criterion' must be 'min', 'max' or 'mean', and defines
    how numeric values will be merged. Columns being merged are cast
    to float first.

    In the returned table the merged columns come first (following the
    iteration order of 'groups'), then every column that was not part
    of any group, in its current order. Row names are taken from this
    table.

    Raises ValueError when the criterion is not supported, when a
    listed column does not exist, when a column is listed more than
    once, or when a group lists no columns at all.

    Example:
      my_groups = {'NewColumn':['column5', 'column6']}
      new_Array = Array.merge_columns(my_groups, 'max')

    """
    if grouping_criterion == "max":
        grouping_f = get_max_vector
    elif grouping_criterion == "min":
        grouping_f = get_min_vector
    elif grouping_criterion == "mean":
        grouping_f = get_mean_vector
    else:
        raise ValueError(
            "grouping_criterion not supported. Use max|min|mean ")

    grouped_array = self.__class__()
    grouped_matrix = []
    colNames = []
    # Every source column consumed by a group; used both to detect
    # duplicates and to know which columns must be copied untouched.
    alltnames = set()

    for gname, tnames in groups.items():
        all_vectors = []
        for tn in tnames:
            if tn not in self.colValues:
                raise ValueError(str(tn) + " column not found.")
            if tn in alltnames:
                raise ValueError(
                    str(tn) + " duplicated column name for merging")
            alltnames.add(tn)
            vector = self.get_column_vector(tn).astype(float)
            all_vectors.append(vector)

        # A group without columns cannot produce a vector of the right
        # length; fail with an explicit message instead of letting the
        # reduction (or the final matrix construction) blow up.
        if not all_vectors:
            raise ValueError(str(gname) + " group has no columns to merge.")

        # Store the group vector = min/max/mean of all columns in group
        grouped_matrix.append(grouping_f(all_vectors))
        # Store the group name
        colNames.append(gname)

    # Columns that do not belong to any group are kept as they are
    for cname in self.colNames:
        if cname not in alltnames:
            grouped_matrix.append(self.get_column_vector(cname))
            colNames.append(cname)

    # NOTE(review): the row name list object is shared with this table,
    # not copied -- confirm that is intended.
    grouped_array.rowNames = self.rowNames
    grouped_array.colNames = colNames
    # grouped_matrix holds one entry per column, so transpose it to get
    # the rows x columns layout.
    vmatrix = numpy.array(grouped_matrix).transpose()
    grouped_array._link_names2matrix(vmatrix)
    return grouped_array
def safe_mean_vector(vectors):
    """ Returns the mean profile of several vectors, discarding non
    finite values (NaN, inf, -inf) position by position.

    'vectors' must be a non-empty sequence of equally indexed numeric
    vectors; the length of the first one defines how many positions
    are processed.

    Returns a (mean, std) tuple of numpy arrays. A position in which
    no vector holds a finite value gets NaN for both mean and std.

    When only one vector is given, it is returned as is (non finite
    values included) together with a vector of zeros as std.

    Raises IndexError if 'vectors' is empty or if some vector is
    shorter than the first one.
    """
    # if only one vector, avg = itself
    if len(vectors) == 1:
        return vectors[0], numpy.zeros(len(vectors[0]))

    # Takes the vector length from the first item
    length = len(vectors[0])

    safe_mean = []
    safe_std = []

    for pos in range(length):
        finite_values = [v[pos] for v in vectors if numpy.isfinite(v[pos])]
        if finite_values:
            safe_mean.append(numpy.mean(finite_values))
            safe_std.append(numpy.std(finite_values))
        else:
            # numpy.mean/std of an empty list also give NaN, but emit a
            # RuntimeWarning on the way; produce the NaN directly.
            safe_mean.append(numpy.nan)
            safe_std.append(numpy.nan)
    return numpy.array(safe_mean), numpy.array(safe_std)
def safe_mean_vector(vectors):
    """ Returns the mean profile of several vectors, discarding non
    finite values (NaN, inf, -inf) position by position.

    'vectors' must be a non-empty sequence of equally indexed numeric
    vectors; the length of the first one defines how many positions
    are processed.

    Returns a (mean, std) tuple of numpy arrays. A position in which
    no vector holds a finite value gets NaN for both mean and std.

    When only one vector is given, it is returned as is (non finite
    values included) together with a vector of zeros as std.

    Raises IndexError if 'vectors' is empty or if some vector is
    shorter than the first one.
    """
    # if only one vector, avg = itself
    if len(vectors) == 1:
        return vectors[0], numpy.zeros(len(vectors[0]))

    # Takes the vector length from the first item
    length = len(vectors[0])

    safe_mean = []
    safe_std = []

    for pos in range(length):
        finite_values = [v[pos] for v in vectors if numpy.isfinite(v[pos])]
        if finite_values:
            safe_mean.append(numpy.mean(finite_values))
            safe_std.append(numpy.std(finite_values))
        else:
            # numpy.mean/std of an empty list also give NaN, but emit a
            # RuntimeWarning on the way; produce the NaN directly.
            safe_mean.append(numpy.nan)
            safe_std.append(numpy.nan)
    return numpy.array(safe_mean), numpy.array(safe_std)
def read_arraytable(matrix_file, mtype="float", arraytable_object = None):
    """ Reads a text tab-delimited matrix and loads it into an
    ArrayTable object.

    'matrix_file' is either the path of the file to read or the
    matrix text itself: when the string contains more than one line
    it is parsed directly, otherwise it is opened as a file.

    'mtype' is the numpy type the values are converted to.

    'arraytable_object' is the object to fill in. When it is None, a
    new ete2 ArrayTable is created.

    Expected format: a header line whose first field is '#NAMES'
    (case insensitive) followed by the column names, then one line
    per row with the row name in the first field. Other lines
    starting with '#' and empty lines are ignored. Empty values are
    loaded as NaN. Duplicated row or column names are renamed by
    adding a numeric suffix, and a warning is written to stderr.

    Returns the filled object.

    Raises ValueError if a data line appears before the column names
    or if a row does not have one value per column.
    """
    # Local import so the warnings do not depend on how (or whether)
    # the module imported 'stderr'.
    import sys

    if arraytable_object is None:
        from ete2.coretype import arraytable
        A = arraytable.ArrayTable()
    else:
        A = arraytable_object

    A.mtype = mtype
    temp_matrix = []
    rowname_counter = {}
    colname_counter = {}
    row_dup_flag = False
    col_dup_flag = False

    # if matrix_file has many lines, tries to read it as the matrix
    # itself.
    text_lines = matrix_file.split("\n")
    if len(text_lines) > 1:
        matrix_data = text_lines
    else:
        # Read everything up front so the file handle is always closed
        with open(matrix_file) as handle:
            matrix_data = handle.readlines()

    for line in matrix_data:
        # Clean up line
        line = line.strip("\n")
        # Skip empty lines
        if not line:
            continue

        # Get fields in line
        fields = line.split("\t")
        # Read column names
        if line[0] == '#' and re.match("#NAMES", fields[0], re.IGNORECASE):
            for colname in fields[1:]:
                colname = colname.strip()

                # Handle duplicated col names by adding a number
                colname_counter[colname] = colname_counter.get(colname, 0) + 1
                if colname in A.colValues:
                    colname += "_%d" % colname_counter[colname]
                    col_dup_flag = True
                # Adds colname
                A.colValues[colname] = None
                A.colNames.append(colname)
            if col_dup_flag:
                sys.stderr.write("Duplicated column names were renamed.\n")

        # Skip comments
        elif line[0] == '#':
            continue

        # Read values (only when column names are loaded)
        elif A.colNames:
            # Checks shape
            if len(fields) - 1 != len(A.colNames):
                raise ValueError(
                    "Invalid number of columns. Expecting:%d"
                    % len(A.colNames))

            # Extracts row name and remove it from fields
            rowname = fields.pop(0).strip()

            # Handles duplicated row names by adding a number
            rowname_counter[rowname] = rowname_counter.get(rowname, 0) + 1
            if rowname in A.rowValues:
                rowname += "_%d" % rowname_counter[rowname]
                row_dup_flag = True

            # Adds row name
            A.rowValues[rowname] = None
            A.rowNames.append(rowname)

            # Reads row values; empty fields become NaN
            values = []
            for f in fields:
                if f.strip() == "":
                    f = numpy.nan
                values.append(f)
            temp_matrix.append(values)
        else:
            raise ValueError("Column names are required.")

    if row_dup_flag:
        sys.stderr.write("Duplicated row names were renamed.\n")

    # Convert all read lines into a numpy matrix
    vmatrix = numpy.array(temp_matrix).astype(A.mtype)

    # Updates indexes to link names and vectors in matrix
    A._link_names2matrix(vmatrix)
    return A
def get_several_row_vectors(self, rownames):
    """ Returns a numpy array holding the vectors associated to the
    given row names, one per name and in the same order as
    'rownames'. Each vector is looked up in self.rowValues. """
    selected = []
    for name in rownames:
        selected.append(self.rowValues[name])
    return numpy.array(selected)
def get_several_column_vectors(self, colnames):
    """ Returns a numpy array holding the vectors associated to the
    given column names, one per name and in the same order as
    'colnames'. Each vector is looked up in self.colValues. """
    selected = []
    for name in colnames:
        selected.append(self.colValues[name])
    return numpy.array(selected)
def get_min_vector(vlist):
    """ Returns the position by position minimum of the vectors in
    'vlist': the list is converted into a numpy array and reduced
    along its first axis. """
    stacked = numpy.array(vlist)
    return stacked.min(axis=0)
def get_max_vector(vlist):
    """ Returns the position by position maximum of the vectors in
    'vlist': the list is converted into a numpy array and reduced
    along its first axis. """
    stacked = numpy.array(vlist)
    return stacked.max(axis=0)
def get_median_vector(vlist):
    """ Returns the position by position median of the vectors in
    'vlist'.

    The list is converted into a numpy array and the median is taken
    along its first axis, so the result has one value per position,
    like get_min_vector and get_max_vector do. (Without the axis,
    numpy.median collapses all values into a single number.) """
    a = numpy.array(vlist)
    return numpy.median(a, axis=0)
def get_several_row_vectors(self, rownames):
    """ Returns a numpy array holding the vectors associated to the
    given row names, one per name and in the same order as
    'rownames'. Each vector is looked up in self.rowValues. """
    selected = []
    for name in rownames:
        selected.append(self.rowValues[name])
    return numpy.array(selected)
def get_several_column_vectors(self, colnames):
    """ Returns a numpy array holding the vectors associated to the
    given column names, one per name and in the same order as
    'colnames'. Each vector is looked up in self.colValues. """
    selected = []
    for name in colnames:
        selected.append(self.colValues[name])
    return numpy.array(selected)
def get_min_vector(vlist):
    """ Returns the position by position minimum of the vectors in
    'vlist': the list is converted into a numpy array and reduced
    along its first axis. """
    stacked = numpy.array(vlist)
    return stacked.min(axis=0)
def get_max_vector(vlist):
    """ Returns the position by position maximum of the vectors in
    'vlist': the list is converted into a numpy array and reduced
    along its first axis. """
    stacked = numpy.array(vlist)
    return stacked.max(axis=0)