def filter_by_occupancy(filename_in, filename_out, min_occupancy=0.5):
    """
    Filter an alignment in fasta format according to the occupancy of the
    columns. Store the results in fasta format.

    The occupancy of a column is the fraction of its characters that are
    not "-". Columns whose occupancy is at least ``min_occupancy`` are kept,
    in their original order; all other columns are dropped from every
    sequence.

    Args:
        filename_in: Path of the alignment to read (parsed by fasta_to_dict).
        filename_out: Path of the fasta file to write. Its parent directory
            is created when it does not exist yet.
        min_occupancy: Minimum occupancy (between 0 and 1) for a column to
            be kept.

    Raises:
        IndexError: If a sequence is shorter than the first sequence of the
            alignment.
    """
    fasta_raw = fasta_to_dict(filename_in)
    sequences = list(fasta_raw.values())
    n_sequences = len(sequences)
    # The first sequence defines the alignment length; an empty alignment has
    # no columns and simply produces an empty output file.
    alignment_length = len(sequences[0]) if sequences else 0

    columns_to_keep = []
    for column_number in range(alignment_length):
        n_gaps = sum(
            1 for sequence in sequences if sequence[column_number] == "-"
        )
        if 1 - float(n_gaps) / float(n_sequences) >= min_occupancy:
            columns_to_keep.append(column_number)

    fasta_trimmed = {
        seqname: "".join(sequence[column] for column in columns_to_keep)
        for seqname, sequence in fasta_raw.items()
    }

    # dirname() is "" for a bare file name, and os.makedirs("") raises
    # FileNotFoundError, so only create a directory when there is one to
    # create. exist_ok avoids a race between the check and the creation.
    out_dir = os.path.dirname(filename_out)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(filename_out, "w") as f_out:
        for seqname, sequence in fasta_trimmed.items():
            f_out.write(
                ">{seqname}\n{sequence}\n".format(
                    seqname=seqname, sequence=sequence
                )
            )
def subset_file(pep_fn, cds_fn, cds_dict):
    """Write to cds_fn the cds_dict entry of every sequence id in pep_fn.

    The ids are the keys returned by fasta_to_dict(pep_fn); each one is
    looked up in cds_dict and written as a two-line fasta record.
    """
    with open(cds_fn, "w") as cds_out:
        for seqid in fasta_to_dict(pep_fn):
            cds_sequence = cds_dict[seqid]
            record = ">{seqid}\n{sequence}\n".format(
                seqid=seqid, sequence=cds_sequence
            )
            cds_out.write(record)
def remove_gaps(fasta_in, fasta_out):
    """Write fasta_in to fasta_out with every "-" stripped from the sequences."""
    with open(fasta_out, "w") as f_out:
        records = fasta_to_dict(fasta_in)
        for name, sequence in records.items():
            ungapped = sequence.replace("-", "")
            f_out.write(f">{name}\n{ungapped}\n")
def translate_fasta(fasta_in, fasta_out):
    """Translate an entire fasta alignment.

    Every sequence read from fasta_in is passed through translate() and
    written to fasta_out under its original name.
    """
    with open(fasta_out, "w") as f_out:
        records = fasta_to_dict(fasta_in)
        f_out.writelines(
            f">{name}\n{translate(sequence)}\n"
            for name, sequence in records.items()
        )
"""Write to cds_fn the cds sequences that are in pep_fn""" with open(cds_fn, "w") as cds_out: for seqid in fasta_to_dict(pep_fn).keys(): cds_out.write( ">{seqid}\n{sequence}\n".format(seqid=seqid, sequence=cds_dict[seqid]) ) if __name__ == "__main__": if len(sys.argv) != 6: sys.stderr.write( "Error. Usage: python split_cds.py folder_in ext_in folder_out " "ext_out all_cds.fa" ) sys.exit(1) IN_DIR = fix_dir_path(sys.argv[1]) IN_EXT = sys.argv[2] OUT_DIR = fix_dir_path(sys.argv[3]) OUT_EXT = sys.argv[4] CDS_FN = sys.argv[5] CDS_DICT = fasta_to_dict(CDS_FN) def process(pep_fn, cds_fn): """Fix parameter""" return subset_file(pep_fn, cds_fn, CDS_DICT) process_folders(IN_DIR, IN_EXT, OUT_DIR, OUT_EXT, process)