Ejemplo n.º 1
0
    def merge_columns(self, groups, grouping_criterion):
        """ Returns a new table object in which columns are merged
        according to a given criterion.

        The result is created with ``self.__class__()``. Every group is
        collapsed into one new column; merged columns come first (in the
        iteration order of 'groups'), followed by all columns of this
        table that were not part of any group, in their current order.

        'groups' argument must be a dictionary in which keys are the
        new column names, and each value is the list of current
        column names to be merged.

        'grouping_criterion' must be 'min', 'max' or 'mean', and
        defines how numeric values will be merged.

        Raises ValueError if the criterion is not supported, if a
        column to be merged does not exist, or if the same column is
        listed more than once across all groups.

        Example:
           my_groups = {'NewColumn':['column5', 'column6']}
           new_Array = Array.merge_columns(my_groups, 'max')

        """

        # Select the reducing function; names are resolved lazily so only
        # the requested helper needs to exist.
        if grouping_criterion == "max":
            grouping_f = get_max_vector
        elif grouping_criterion == "min":
            grouping_f = get_min_vector
        elif grouping_criterion == "mean":
            grouping_f = get_mean_vector
        else:
            raise ValueError(
                "grouping_criterion not supported. Use max|min|mean ")

        grouped_array = self.__class__()
        grouped_matrix = []
        colNames = []
        # Every source column consumed by some group so far.
        alltnames = set()
        # items() (instead of the Python-2-only iteritems()) works on both
        # Python 2 and Python 3.
        for gname, tnames in groups.items():
            all_vectors = []
            for tn in tnames:
                if tn not in self.colValues:
                    raise ValueError(str(tn) + " column not found.")
                if tn in alltnames:
                    raise ValueError(
                        str(tn) + " duplicated column name for merging")
                alltnames.add(tn)
                # Merged columns are always reduced as floats.
                vector = self.get_column_vector(tn).astype(float)
                all_vectors.append(vector)
            # Store the group vector: the chosen criterion (max, min or
            # mean) applied to all columns in the group.
            grouped_matrix.append(grouping_f(all_vectors))
            # Store the group name
            colNames.append(gname)

        # Columns that were not merged are carried over as they are.
        for cname in self.colNames:
            if cname not in alltnames:
                grouped_matrix.append(self.get_column_vector(cname))
                colNames.append(cname)

        # NOTE(review): the new table receives the very same rowNames
        # object as this one (no copy is made) - confirm this is intended.
        grouped_array.rowNames = self.rowNames
        grouped_array.colNames = colNames
        # grouped_matrix holds one entry per column; transpose so that
        # rows come first before linking names to vectors.
        vmatrix = numpy.array(grouped_matrix).transpose()
        grouped_array._link_names2matrix(vmatrix)
        return grouped_array
Ejemplo n.º 2
0
    def merge_columns(self, groups, grouping_criterion):
        """ Returns a new table object in which columns are merged
        according to a given criterion.

        The result is created with ``self.__class__()``. Every group is
        collapsed into one new column; merged columns come first (in the
        iteration order of 'groups'), followed by all columns of this
        table that were not part of any group, in their current order.

        'groups' argument must be a dictionary in which keys are the
        new column names, and each value is the list of current
        column names to be merged.

        'grouping_criterion' must be 'min', 'max' or 'mean', and
        defines how numeric values will be merged.

        Raises ValueError if the criterion is not supported, if a
        column to be merged does not exist, or if the same column is
        listed more than once across all groups.

        Example:
           my_groups = {'NewColumn':['column5', 'column6']}
           new_Array = Array.merge_columns(my_groups, 'max')

        """

        # Select the reducing function; names are resolved lazily so only
        # the requested helper needs to exist.
        if grouping_criterion == "max":
            grouping_f = get_max_vector
        elif grouping_criterion == "min":
            grouping_f = get_min_vector
        elif grouping_criterion == "mean":
            grouping_f = get_mean_vector
        else:
            raise ValueError(
                "grouping_criterion not supported. Use max|min|mean ")

        grouped_array = self.__class__()
        grouped_matrix = []
        colNames = []
        # Every source column consumed by some group so far.
        alltnames = set()
        # items() (instead of the Python-2-only iteritems()) works on both
        # Python 2 and Python 3.
        for gname, tnames in groups.items():
            all_vectors = []
            for tn in tnames:
                if tn not in self.colValues:
                    raise ValueError(str(tn) + " column not found.")
                if tn in alltnames:
                    raise ValueError(
                        str(tn) + " duplicated column name for merging")
                alltnames.add(tn)
                # Merged columns are always reduced as floats.
                vector = self.get_column_vector(tn).astype(float)
                all_vectors.append(vector)
            # Store the group vector: the chosen criterion (max, min or
            # mean) applied to all columns in the group.
            grouped_matrix.append(grouping_f(all_vectors))
            # Store the group name
            colNames.append(gname)

        # Columns that were not merged are carried over as they are.
        for cname in self.colNames:
            if cname not in alltnames:
                grouped_matrix.append(self.get_column_vector(cname))
                colNames.append(cname)

        # NOTE(review): the new table receives the very same rowNames
        # object as this one (no copy is made) - confirm this is intended.
        grouped_array.rowNames = self.rowNames
        grouped_array.colNames = colNames
        # grouped_matrix holds one entry per column; transpose so that
        # rows come first before linking names to vectors.
        vmatrix = numpy.array(grouped_matrix).transpose()
        grouped_array._link_names2matrix(vmatrix)
        return grouped_array
Ejemplo n.º 3
0
def safe_mean_vector(vectors):
    """ Returns the mean profile of several vectors, discarding non
    finite values.

    'vectors' must be a non-empty sequence of equally long vectors; the
    length is taken from the first one.

    Returns a (mean, std) tuple. For every position, only the finite
    values (neither NaN nor +/-inf) found across the vectors are used.
    Positions without any finite value yield NaN in both results.

    When a single vector is given it is returned as it is (non finite
    values included) together with a vector of zeros as deviation.
    """
    # if only one vector, avg = itself
    if len(vectors) == 1:
        return vectors[0], numpy.zeros(len(vectors[0]))
    # Takes the vector length from the first item
    length = len(vectors[0])

    safe_mean = []
    safe_std = []

    # range() instead of xrange() so the code runs on Python 2 and 3.
    for pos in range(length):
        finite_values = [v[pos] for v in vectors if numpy.isfinite(v[pos])]
        if finite_values:
            safe_mean.append(numpy.mean(finite_values))
            safe_std.append(numpy.std(finite_values))
        else:
            # numpy.mean/std of an empty list also give NaN, but emit
            # RuntimeWarnings; produce the same result explicitly.
            safe_mean.append(numpy.nan)
            safe_std.append(numpy.nan)
    return numpy.array(safe_mean), numpy.array(safe_std)
Ejemplo n.º 4
0
def safe_mean_vector(vectors):
    """ Returns the mean profile of several vectors, discarding non
    finite values.

    'vectors' must be a non-empty sequence of equally long vectors; the
    length is taken from the first one.

    Returns a (mean, std) tuple. For every position, only the finite
    values (neither NaN nor +/-inf) found across the vectors are used.
    Positions without any finite value yield NaN in both results.

    When a single vector is given it is returned as it is (non finite
    values included) together with a vector of zeros as deviation.
    """
    # if only one vector, avg = itself
    if len(vectors) == 1:
        return vectors[0], numpy.zeros(len(vectors[0]))
    # Takes the vector length from the first item
    length = len(vectors[0])

    safe_mean = []
    safe_std = []

    # range() instead of xrange() so the code runs on Python 2 and 3.
    for pos in range(length):
        finite_values = [v[pos] for v in vectors if numpy.isfinite(v[pos])]
        if finite_values:
            safe_mean.append(numpy.mean(finite_values))
            safe_std.append(numpy.std(finite_values))
        else:
            # numpy.mean/std of an empty list also give NaN, but emit
            # RuntimeWarnings; produce the same result explicitly.
            safe_mean.append(numpy.nan)
            safe_std.append(numpy.nan)
    return numpy.array(safe_mean), numpy.array(safe_std)
Ejemplo n.º 5
0
def read_arraytable(matrix_file, mtype="float", arraytable_object=None):
    """ Reads a text tab-delimited matrix and loads it into an array table.

    'matrix_file' is either the matrix text itself or the path of a file
    containing it: a string with at least one newline is parsed as data,
    anything else is opened as a file.

    'mtype' is the type the value matrix is converted to (passed to numpy
    'astype'); it is also stored in the 'mtype' attribute of the table.

    'arraytable_object' is the table to fill. It must provide the
    'colNames', 'rowNames', 'colValues' and 'rowValues' containers and a
    '_link_names2matrix' method. If None, a new ete2 ArrayTable is created.

    Expected format:
      - a header line whose first field matches "#NAMES" (case
        insensitive), followed by the column names;
      - any other line starting with '#' is a comment, and empty lines
        are skipped;
      - every other line holds a row name followed by one value per
        column. Empty values are read as NaN.

    Duplicated column or row names are renamed by appending "_<n>", and a
    warning is written to stderr.

    Returns the filled table. Raises ValueError if a data line is found
    before the column names, or if its number of values does not match
    the number of columns.
    """

    if arraytable_object is None:
        from ete2.coretype import arraytable
        A = arraytable.ArrayTable()
    else:
        A = arraytable_object

    A.mtype = mtype
    temp_matrix = []
    rowname_counter = {}
    colname_counter = {}
    row_dup_flag = False
    col_dup_flag = False

    # if matrix_file has many lines, tries to read it as the matrix
    # itself.
    text_lines = matrix_file.split("\n")
    handle = None
    if len(text_lines) > 1:
        matrix_data = text_lines
    else:
        handle = open(matrix_file)
        matrix_data = handle

    try:
        for line in matrix_data:
            # Clean up line endings (both "\n" and "\r\n")
            line = line.strip("\r\n")
            # Skip empty lines
            if not line:
                continue
            # Get fields in line
            fields = line.split("\t")
            # Read column names
            if line[0] == '#' and re.match("#NAMES", fields[0], re.IGNORECASE):
                for colname in fields[1:]:
                    colname = colname.strip()

                    # Handle duplicated col names by adding a number
                    colname_counter[colname] = \
                        colname_counter.get(colname, 0) + 1
                    if colname in A.colValues:
                        colname += "_%d" % colname_counter[colname]
                        col_dup_flag = True
                    # Adds colname
                    A.colValues[colname] = None
                    A.colNames.append(colname)
                if col_dup_flag:
                    stderr.write("Duplicated column names were renamed.\n")

            # Skip comments
            elif line[0] == '#':
                continue

            # Read values (only when column names are loaded)
            elif A.colNames:
                # Checks shape (first field is the row name)
                if len(fields) - 1 != len(A.colNames):
                    raise ValueError(
                        "Invalid number of columns. Expecting:%d"
                        % len(A.colNames))

                # Extracts row name and remove it from fields
                rowname = fields.pop(0).strip()

                # Handles duplicated row names by adding a number
                rowname_counter[rowname] = rowname_counter.get(rowname, 0) + 1
                if rowname in A.rowValues:
                    rowname += "_%d" % rowname_counter[rowname]
                    # Remember the renaming so the warning below is issued.
                    row_dup_flag = True

                # Adds row name
                A.rowValues[rowname] = None
                A.rowNames.append(rowname)

                # Reads row values; empty fields are missing values
                values = []
                for f in fields:
                    if f.strip() == "":
                        f = numpy.nan
                    values.append(f)
                temp_matrix.append(values)
            else:
                raise ValueError("Column names are required.")
    finally:
        # Only close what was opened here.
        if handle is not None:
            handle.close()

    if row_dup_flag:
        stderr.write("Duplicated row names were renamed.\n")

    # Convert all read lines into a numpy matrix
    vmatrix = numpy.array(temp_matrix).astype(A.mtype)

    # Updates indexes to link names and vectors in matrix
    A._link_names2matrix(vmatrix)
    return A
Ejemplo n.º 6
0
 def get_several_row_vectors(self, rownames):
     """ Returns a numpy array stacking the vectors stored under each
     of the given row names, in the order the names are given. """
     collected = []
     for name in rownames:
         # Unknown names fail with the lookup error of self.rowValues.
         collected.append(self.rowValues[name])
     return numpy.array(collected)
Ejemplo n.º 7
0
 def get_several_column_vectors(self, colnames):
     """ Returns a numpy array stacking the vectors stored under each
     of the given column names, in the order the names are given. """
     collected = []
     for name in colnames:
         # Unknown names fail with the lookup error of self.colValues.
         collected.append(self.colValues[name])
     return numpy.array(collected)
Ejemplo n.º 8
0
def get_min_vector(vlist):
    """ Returns the position-wise minimum of the vectors in 'vlist'.

    The vectors are stacked into a numpy array and reduced along the
    first axis, so the result has one value per vector position.
    """
    stacked = numpy.array(vlist)
    return stacked.min(axis=0)
Ejemplo n.º 9
0
def get_max_vector(vlist):
    """ Returns the position-wise maximum of the vectors in 'vlist'.

    The vectors are stacked into a numpy array and reduced along the
    first axis, so the result has one value per vector position.
    """
    stacked = numpy.array(vlist)
    return stacked.max(axis=0)
Ejemplo n.º 10
0
def get_median_vector(vlist):
    """ Returns the position-wise median of the vectors in 'vlist'.

    The vectors are stacked into a numpy array and the median is taken
    along the first axis, so the result has one value per vector
    position.
    """
    a = numpy.array(vlist)
    # The axis must be explicit: without it numpy.median flattens the
    # array and returns a single scalar instead of a vector.
    return numpy.median(a, axis=0)
Ejemplo n.º 11
0
 def get_several_row_vectors(self, rownames):
     """ Looks up the vector of every requested row name and returns
     them, in the requested order, as a single numpy array. """
     return numpy.array([self.rowValues[key] for key in rownames])
Ejemplo n.º 12
0
 def get_several_column_vectors(self, colnames):
     """ Looks up the vector of every requested column name and returns
     them, in the requested order, as a single numpy array. """
     return numpy.array([self.colValues[key] for key in colnames])
Ejemplo n.º 13
0
def get_min_vector(vlist):
    """ Computes the minimum at every position across the given vectors.

    'vlist' is converted to a numpy array and reduced along its first
    axis.
    """
    return numpy.min(numpy.array(vlist), axis=0)
Ejemplo n.º 14
0
def get_max_vector(vlist):
    """ Computes the maximum at every position across the given vectors.

    'vlist' is converted to a numpy array and reduced along its first
    axis.
    """
    return numpy.max(numpy.array(vlist), axis=0)
Ejemplo n.º 15
0
def get_median_vector(vlist):
    """ Returns the position-wise median of the vectors in 'vlist'.

    The vectors are stacked into a numpy array and the median is taken
    along the first axis, so the result has one value per vector
    position.
    """
    a = numpy.array(vlist)
    # The axis must be explicit: without it numpy.median flattens the
    # array and returns a single scalar instead of a vector.
    return numpy.median(a, axis=0)