def place_seqs_pipeline(study_fasta, ref_dir, out_tree, threads, out_dir,
                        chunk_size, print_cmds):
    '''Run the sequence placement workflow from raw study sequences to an
    output tree.

    Steps, in order: look up the reference files in ref_dir, align the study
    sequences against the reference MSA with hmmalign, split the combined
    alignment into reference-only and study-only FASTA files, place the study
    sequences with EPA-NG and finally convert the resulting .jplace file to a
    newick tree written to out_tree. Intermediate files are written under
    out_dir.'''

    # Reference alignment, tree, HMM and model all come from the reference
    # directory.
    ref_msa, tree, hmm, model = identify_ref_files(ref_dir)

    # hmmalign writes a single Stockholm file containing both the reference
    # sequences (via --mapali) and the study sequences.
    stockholm_path = path.join(out_dir, "query_align.stockholm")
    hmmalign_cmd = "".join(("hmmalign --trim --dna --mapali ", ref_msa,
                            " --informat FASTA -o ", stockholm_path,
                            " ", hmm, " ", study_fasta))
    system_call_check(hmmalign_cmd, print_out=print_cmds)

    aligned = read_stockholm(stockholm_path, clean_char=True)

    # Sequence ids of each input decide which aligned rows go to which file.
    ref_ids = set(read_fasta(ref_msa).keys())
    study_ids = set(read_fasta(study_fasta).keys())

    def _aligned_rows(seq_ids):
        return {seq_id: aligned[seq_id] for seq_id in seq_ids}

    ref_rows = _aligned_rows(ref_ids)
    study_rows = _aligned_rows(study_ids)

    ref_msa_fastafile = path.join(out_dir, "ref_seqs_hmmalign.fasta")
    study_msa_fastafile = path.join(out_dir, "study_seqs_hmmalign.fasta")
    write_fasta(ref_rows, ref_msa_fastafile)
    write_fasta(study_rows, study_msa_fastafile)

    # EPA-NG placement; its output directory lives inside out_dir.
    epa_out_dir = path.join(out_dir, "epa_out")
    run_epa_ng(tree=tree,
               ref_msa_fastafile=ref_msa_fastafile,
               study_msa_fastafile=study_msa_fastafile,
               model=model,
               chunk_size=chunk_size,
               threads=threads,
               out_dir=epa_out_dir,
               print_cmds=print_cmds)

    # Convert the placements to a newick tree at the requested location.
    gappa_jplace_to_newick(
        jplace_file=path.join(epa_out_dir, "epa_result_parsed.jplace"),
        outfile=out_tree,
        print_cmds=print_cmds)
def split_ref_study_papara(papara_out: dict, ref_seqnames: set,
                           ref_fasta: str, study_fasta: str):
    '''Write the reference and study portions of a PaPaRa alignment to two
    separate FASTA files.

    papara_out is the PaPaRa alignment already loaded as a dictionary keyed
    by sequence id. ref_seqnames holds the ids that belong to the reference
    MSA; every other id in papara_out is treated as a study sequence.
    ref_fasta and study_fasta are the output filenames for the two
    subsets.'''

    # Anything in the alignment that is not a reference id is a study id.
    study_ids = set(papara_out.keys()).difference(ref_seqnames)

    def _rows_for(seq_ids):
        return {seq_id: papara_out[seq_id] for seq_id in seq_ids}

    # Reference subset is built first; a reference id missing from the
    # alignment raises KeyError here.
    ref_rows = _rows_for(ref_seqnames)
    study_rows = _rows_for(study_ids)

    write_fasta(ref_rows, ref_fasta)
    write_fasta(study_rows, study_fasta)
def test_read_write_fasta(self):
    '''Round-trip check: sequences written with write_fasta must come back
    unchanged from read_fasta.'''

    expected = dict(seq1="GNATNGAC",
                    seq2="GTCGTGGC",
                    seq3="GNCTGAGATTAACC")

    # The file only needs to exist long enough to be read back in.
    with TemporaryDirectory() as scratch_dir:
        fasta_path = path.join(scratch_dir, "test.fna")
        write_fasta(expected, fasta_path)
        observed = read_fasta(fasta_path)

    self.assertEqual(expected, observed)
def place_seqs_pipeline(study_fasta, ref_msa, tree, hmm, out_tree,
                        alignment_tool, threads, out_dir, chunk_size,
                        print_cmds):
    '''Full pipeline for running sequence placement.

    Aligns the study sequences against the reference alignment with the
    chosen tool, splits the combined alignment into reference-only and
    study-only FASTA files, places the study sequences with EPA-NG and
    converts the resulting .jplace file into a newick tree.

    Args:
        study_fasta: path to the FASTA file of study sequences.
        ref_msa: path to the reference MSA in FASTA format.
        tree: reference tree, forwarded to PaPaRa and EPA-NG.
        hmm: HMM file handed to hmmalign (only used for "hmmalign").
        out_tree: output filename for the final newick tree.
        alignment_tool: either "hmmalign" or "papara".
        threads: forwarded to PaPaRa and EPA-NG.
        out_dir: directory under which intermediate files are written.
        chunk_size: forwarded to EPA-NG.
        print_cmds: forwarded to the wrapped tool calls.

    Raises:
        ValueError: if alignment_tool is not "hmmalign" or "papara".
    '''

    # Reject unknown tools before doing any work. Without this check neither
    # branch below runs, the split FASTA paths are never defined and the
    # EPA-NG call fails with an unrelated name error.
    if alignment_tool not in ("hmmalign", "papara"):
        raise ValueError('Unknown alignment_tool "' + str(alignment_tool) +
                         '"; expected "hmmalign" or "papara".')

    if alignment_tool == "hmmalign":

        # hmmalign writes one Stockholm file holding both the reference
        # sequences (via --mapali) and the study sequences.
        out_stockholm = path.join(out_dir, "query_align.stockholm")

        system_call_check("hmmalign --trim --dna --mapali " + ref_msa +
                          " --informat FASTA -o " + out_stockholm + " " +
                          hmm + " " + study_fasta,
                          print_out=print_cmds)

        hmmalign_out = read_stockholm(out_stockholm, clean_char=True)

        # Specify split FASTA files to be created.
        study_msa_fastafile = path.join(out_dir, "study_seqs_hmmalign.fasta")
        ref_msa_fastafile = path.join(out_dir, "ref_seqs_hmmalign.fasta")

        # The ids found in each input file decide which aligned rows go to
        # which output file.
        ref_seqnames = set(read_fasta(ref_msa).keys())
        study_seqnames = set(read_fasta(study_fasta).keys())

        ref_hmmalign_subset = {seq: hmmalign_out[seq]
                               for seq in ref_seqnames}
        study_hmmalign_subset = {seq: hmmalign_out[seq]
                                 for seq in study_seqnames}

        write_fasta(ref_hmmalign_subset, ref_msa_fastafile)
        write_fasta(study_hmmalign_subset, study_msa_fastafile)

    else:
        # alignment_tool == "papara"

        # Read in ref seqs FASTA as a dict. Kept in its own local so the
        # ref_msa parameter keeps referring to the file path.
        ref_seqs = read_fasta(ref_msa)

        # Run PaPaRa to place study sequences; the returned alignment is used
        # as a dictionary keyed by sequence id.
        papara_out = run_papara(tree=tree, ref_msa=ref_seqs,
                                study_fasta=study_fasta, out_dir=out_dir,
                                threads=threads, print_cmds=print_cmds)

        # Specify split FASTA files to be created.
        study_msa_fastafile = path.join(out_dir, "study_seqs_papara.fasta")
        ref_msa_fastafile = path.join(out_dir, "ref_seqs_papara.fasta")

        # Split PaPaRa output into two FASTA files containing study and
        # reference sequences respectively.
        split_ref_study_papara(papara_out=papara_out,
                               ref_seqnames=set(ref_seqs.keys()),
                               study_fasta=study_msa_fastafile,
                               ref_fasta=ref_msa_fastafile)

    # Run EPA-NG to output .jplace file.
    epa_out_dir = path.join(out_dir, "epa_out")

    run_epa_ng(tree=tree, ref_msa_fastafile=ref_msa_fastafile,
               study_msa_fastafile=study_msa_fastafile,
               chunk_size=chunk_size, threads=threads,
               out_dir=epa_out_dir, print_cmds=print_cmds)

    jplace_outfile = path.join(epa_out_dir, "epa_result.jplace")

    # Convert the placements into a newick tree at the requested location.
    gappa_jplace_to_newick(jplace_file=jplace_outfile, outfile=out_tree,
                           print_cmds=print_cmds)
def main():
    '''Command-line entry point: keep at most one sequence from a FASTA file.

    Sequences outside the length bounds, or with a proportion of N
    characters above the cut-off, are discarded. Of the survivors, the one
    with the lowest proportion of Ns is written (upper-cased) to
    <output_dir>/<input basename>_best.fna. Exits with a message, writing
    nothing, if the input is empty or no sequence passes the filters.
    '''

    parser = argparse.ArgumentParser(

        description="Reads in FASTA file and keeps single sequence (or "
                    "possibly no sequence). Works by first screening out all "
                    "sequences of length less or greater than the lower and "
                    "upper bounds given. Will then screen out sequences with "
                    "greater than a set percent of Ns. Then preferentially keeps "
                    "sequences with lower proportion of Ns. Finally if there is "
                    "still a tie, will just choose a sequence randomly. Output file only "
                    "created if there is a final sequence to be written.",

        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument("-f", "--fasta", metavar="FASTA", type=str,
                        help="Path to FASTA file", required=True)

    parser.add_argument("-l", "--lower_length", metavar="SIZE", type=int,
                        required=True,
                        help="Min length of each seq in FASTA")

    parser.add_argument("-u", "--upper_length", metavar="SIZE", type=int,
                        required=True,
                        help="Max length of each seq in FASTA")

    parser.add_argument("-p", "--prop_n", metavar="SIZE", type=float,
                        required=True,
                        help="Proportion of N characters "
                             "permitted per-sequence")

    parser.add_argument("-o", "--output_dir", metavar="PATH", type=str,
                        required=True,
                        help="Output directory to write final FASTA of single "
                             "sequence IF there is a sequence left to write "
                             "after filtering.")

    parser.add_argument(
        "--rename_seq", action="store_true",
        help="Flag to indicate that the sequence header should "
             "be renamed to be the the first 2 fields of the "
             "filenames after delimiting by \'_\'")

    parser.add_argument(
        "--rename_seq_full", action="store_true",
        help="Flag to indicate that the sequence header should "
             "be renamed to be the full filename.")

    args = parser.parse_args()

    in_fasta = read_fasta(args.fasta)

    # If no sequences in file then stop job.
    if not in_fasta:
        sys.exit("Stopping - no sequences in file.")

    # Keep only sequences whose length is inside the cut-off range and whose
    # proportion of N characters does not exceed the permitted maximum.
    seq_N_pro = {}

    for seq_id, sequence in in_fasta.items():

        seq_len = len(sequence)

        if seq_len < args.lower_length or seq_len > args.upper_length:
            continue

        # An empty sequence has no Ns; avoid dividing by zero when the lower
        # length bound allows it through.
        N_pro = sequence.upper().count("N") / seq_len if seq_len else 0.0

        if N_pro > args.prop_n:
            continue

        seq_N_pro[seq_id] = N_pro

    # If nothing passed the filters then stop job.
    if not seq_N_pro:
        sys.exit("Stopping - no sequences left after filtering.")

    # Pick the sequence with the lowest proportion of Ns. min() returns the
    # first minimal key it encounters, so ties go to whichever sequence comes
    # first in the dictionary's iteration order.
    best_seq = min(seq_N_pro, key=seq_N_pro.get)

    out_basename = path.splitext(path.basename(args.fasta))[0]

    outfile = path.join(args.output_dir, out_basename + "_best.fna")

    # If rename_seq option set then replace current header with first 2 fields
    # of filename after delimiting by "_".
    if args.rename_seq:
        file_split = out_basename.split("_")

        # Fail with a clear message rather than an IndexError when the
        # filename has fewer than two fields.
        if len(file_split) < 2:
            sys.exit("Stopping - --rename_seq requires at least two "
                     "'_'-delimited fields in the input filename.")

        seqname = file_split[0] + "_" + file_split[1]

    elif args.rename_seq_full:
        seqname = out_basename

    else:
        seqname = best_seq

    # Single-entry dictionary so the sequence can be output easily.
    out_seq = {seqname: in_fasta[best_seq].upper()}

    write_fasta(out_seq, outfile)
def place_seqs_pipeline(study_fasta, ref_dir, out_tree, threads, out_dir,
                        min_align, chunk_size, verbose):
    '''Run the sequence placement workflow from raw study sequences to an
    output tree.

    Steps, in order: look up the reference files in ref_dir, align the study
    sequences against the reference MSA with hmmalign, pass the aligned study
    sequences through check_alignments (with min_align), write reference and
    study alignments to separate FASTA files, place the study sequences with
    EPA-ng and convert the resulting JPLACE file to a newick tree written to
    out_tree. Intermediate files are written under out_dir. Exits early if
    the study FASTA filepath contains a space.'''

    # The hmmalign command is built by plain string concatenation, so a space
    # in the path would split it into separate arguments.
    if " " in study_fasta:
        sys.exit("Stopping - remove the space from the input FASTA filepath.")

    # Reference alignment, tree, HMM and model all come from the reference
    # directory.
    ref_msa, tree, hmm, model = identify_ref_files(ref_dir)

    # hmmalign writes a single Stockholm file containing both the reference
    # sequences (via --mapali) and the study sequences.
    stockholm_path = path.join(out_dir, "query_align.stockholm")
    hmmalign_cmd = "".join(("hmmalign --trim --dna --mapali ", ref_msa,
                            " --informat FASTA -o ", stockholm_path,
                            " ", hmm, " ", study_fasta))
    system_call_check(hmmalign_cmd,
                      print_command=verbose,
                      print_stdout=verbose,
                      print_stderr=verbose)

    aligned = read_stockholm(stockholm_path, clean_char=True)

    # Reference rows are selected by id; study rows are selected (and
    # possibly filtered) by check_alignments.
    ref_ids = set(read_fasta(ref_msa).keys())
    raw_study_seqs = read_fasta(study_fasta)

    ref_rows = {seq_id: aligned[seq_id] for seq_id in ref_ids}
    study_rows = check_alignments(raw_seqs=raw_study_seqs,
                                  aligned_seqs=aligned,
                                  min_align=min_align,
                                  verbose=verbose)

    ref_msa_fastafile = path.join(out_dir, "ref_seqs_hmmalign.fasta")
    study_msa_fastafile = path.join(out_dir, "study_seqs_hmmalign.fasta")
    write_fasta(ref_rows, ref_msa_fastafile)
    write_fasta(study_rows, study_msa_fastafile)

    # EPA-ng placement; its output directory lives inside out_dir.
    epa_out_dir = path.join(out_dir, "epa_out")
    run_epa_ng(tree=tree,
               ref_msa_fastafile=ref_msa_fastafile,
               study_msa_fastafile=study_msa_fastafile,
               model=model,
               chunk_size=chunk_size,
               threads=threads,
               out_dir=epa_out_dir,
               print_cmds=verbose)

    # Convert the placements to a newick tree at the requested location.
    gappa_jplace_to_newick(
        jplace_file=path.join(epa_out_dir, "epa_result_parsed.jplace"),
        outfile=out_tree,
        print_cmds=verbose)