Example #1
0
def main(fin, fout, fout_weights):
    """
    Collapse repeated codon columns of an interleaved phylip alignment.

    The alignment is read through phylip.read_interleaved_codon_alignment.
    The first item it yields is taken to be the sequence of taxon names,
    and every later item is treated as one codon column.
    Each distinct column is written once, in order of first appearance,
    and the number of times it occurred is written to the weights file.

    @param fin: open file for reading interleaved phylip alignment
    @param fout: open file for writing interleaved phylip alignment
    @param fout_weights: open file for writing codon column multiplicities,
        or None to skip writing the multiplicities
    @raise ValueError: if the reader yields nothing, not even taxon names
    """

    # Split the reader output into the header and the distinct columns.
    # Columns are used as dict keys, so they must be hashable;
    # presumably the reader yields tuples of codon strings, one per taxon.
    taxon_names = None
    unique_col_list = []
    col_to_count = defaultdict(int)
    for col in phylip.read_interleaved_codon_alignment(fin):
        if taxon_names is None:
            taxon_names = col
            continue
        # Membership testing on a defaultdict does not create an entry,
        # so this detects the first appearance of a column.
        if col not in col_to_count:
            unique_col_list.append(col)
        col_to_count[col] += 1

    # Fail with a clear message instead of a TypeError from len(None).
    if taxon_names is None:
        raise ValueError('the alignment reader yielded no taxon names')

    # The label field is as wide as the longest name, but at least 9 wide.
    ljust_spacing = max([9] + [len(name) for name in taxon_names])
    blank_label = ''.ljust(ljust_spacing)

    # Write the interleaved phylip header.
    # The sequence length is reported as three characters per codon column.
    # Explicit write() calls are used instead of the print statement
    # so that the same bytes are produced under Python 2 and Python 3.
    ntaxa = len(taxon_names)
    fout.write(' %d %d\n' % (ntaxa, 3 * len(unique_col_list)))

    # Write the output one paragraph of columns at a time.
    ncols_per_paragraph = 15
    for offset in range(0, len(unique_col_list), ncols_per_paragraph):
        cols = unique_col_list[offset:offset + ncols_per_paragraph]

        # Write the weights corresponding to these columns, one per line.
        if fout_weights is not None:
            fout_weights.write(
                    ''.join(str(col_to_count[c]) + '\n' for c in cols))

        # Transpose the columns back into one row per taxon.
        # The list() call is needed because zip is lazy under Python 3.
        rows = list(zip(*cols))

        # Only the first paragraph carries the taxon names;
        # later paragraphs pad the label field with blanks.
        # A single space separates the label field from the codons.
        for i, name in enumerate(taxon_names):
            label = blank_label if offset else name.ljust(ljust_spacing)
            fout.write(label + ' ' + ' '.join(rows[i]) + '\n')

        # A blank line terminates each paragraph.
        fout.write('\n')
def _require_sequential_indices(indices, description):
    """
    Check that the index strings spell out 0, 1, 2, ... in order.

    @param indices: sequence of strings that should parse as integers
    @param description: short name of the table, used in the error message
    @raise ValueError: if an index is not an integer or is out of sequence
    """
    # The list() call matters under Python 3,
    # where a list never compares equal to a range object.
    if [int(x) for x in indices] != list(range(len(indices))):
        raise ValueError(
                'expected the %s rows to be numbered sequentially from zero'
                % description)


def main(fin, fin_gcode, fin_taxa, fout):
    """
    Convert a codon alignment into a tab separated table of integers.

    The table is computed by design.get_pattern_array from the codons
    of the genetic code and the alignment columns.
    When a taxon file is supplied, the table columns are rearranged
    to follow the requested taxon order, and requested taxa that are
    missing from the alignment get the value -1 in every row.

    @param fin: interleaved phylip codon alignment file open for reading
    @param fin_gcode: open file for reading the genetic code
        as tab separated rows of (index, amino acid, codon)
    @param fin_taxa: optional open file for defining taxon subset and order
        as tab separated rows of (index, taxon name), or None
    @param fout: open file for writing the integer ndarray as text
    @raise ValueError: if the rows of a tab separated input are not
        numbered sequentially from zero, or if a taxon subset is
        requested but the alignment reader yielded no taxon names
    """

    # Read the description of the genetic code.
    gcode_rows = list(csv.reader(fin_gcode, delimiter='\t'))
    gcode_indices, aminos, codons = zip(*gcode_rows)
    _require_sequential_indices(gcode_indices, 'genetic code')

    # Read the interleaved phylip alignment.
    # The first item yielded by the reader is taken to be the taxon names,
    # and the remaining items are the alignment columns.
    taxon_names = None
    cols = []
    for col in phylip.read_interleaved_codon_alignment(fin):
        if taxon_names is None:
            taxon_names = col
        else:
            cols.append(col)

    # Define the ndarray of integers.
    # It is indexed below as [alignment column, taxon].
    M_full = design.get_pattern_array(codons, cols)

    if fin_taxa is None:

        # Without a taxon file the full array is written as it is.
        M = M_full

    else:

        # Read the ordered taxon subset.
        taxa_rows = list(csv.reader(fin_taxa, delimiter='\t'))
        taxa_indices, requested_taxa = zip(*taxa_rows)
        _require_sequential_indices(taxa_indices, 'taxon')

        # Fail with a clear message instead of a TypeError from enumerate.
        if taxon_names is None:
            raise ValueError('the alignment reader yielded no taxon names')

        # Init the pattern ndarray with unknown codon states.
        M = np.empty((len(cols), len(requested_taxa)), dtype=int)
        M.fill(-1)

        # Construct the inverse map of the default taxon ordering.
        name_to_phylip_index = dict(
                (name, i) for i, name in enumerate(taxon_names))

        # Redefine the columns according to the user ordering and subsetting.
        # Requested taxa that are absent from the alignment keep the -1 fill.
        for i, name in enumerate(requested_taxa):
            phylip_index = name_to_phylip_index.get(name)
            if phylip_index is not None:
                M[:, i] = M_full[:, phylip_index]

    # Write the ndarray of integers.
    np.savetxt(fout, M, fmt='%d', delimiter='\t')