コード例 #1
0
def place_seqs_pipeline(study_fasta, ref_dir, out_tree, threads, out_dir,
                        chunk_size, print_cmds):
    '''Place study sequences into a reference tree.

    Aligns the study sequences against the reference MSA with hmmalign,
    writes the reference and study rows of that alignment to two separate
    FASTA files in out_dir, hands those files to EPA-NG and finally converts
    the parsed JPLACE output into a newick tree at out_tree.'''

    # Look up the reference alignment, tree, HMM and model in ref_dir.
    ref_msa, tree, hmm, model = identify_ref_files(ref_dir)

    # Intermediate files all live under out_dir.
    stockholm_path = path.join(out_dir, "query_align.stockholm")
    study_msa_fastafile = path.join(out_dir, "study_seqs_hmmalign.fasta")
    ref_msa_fastafile = path.join(out_dir, "ref_seqs_hmmalign.fasta")

    # Align study sequences onto the reference MSA (Stockholm output).
    hmmalign_cmd = "".join(("hmmalign  --trim --dna --mapali ", ref_msa,
                            " --informat FASTA -o ", stockholm_path,
                            " ", hmm,
                            " ", study_fasta))
    system_call_check(hmmalign_cmd, print_out=print_cmds)

    aligned = read_stockholm(stockholm_path, clean_char=True)

    # Sequence ids come from the original (unaligned) input files.
    ref_ids = set(read_fasta(ref_msa).keys())
    study_ids = set(read_fasta(study_fasta).keys())

    # Pull the aligned rows for each group out of the combined alignment.
    ref_aligned = {}
    for seq_id in ref_ids:
        ref_aligned[seq_id] = aligned[seq_id]

    study_aligned = {}
    for seq_id in study_ids:
        study_aligned[seq_id] = aligned[seq_id]

    write_fasta(ref_aligned, ref_msa_fastafile)
    write_fasta(study_aligned, study_msa_fastafile)

    # Placement step: EPA-NG writes its results into its own sub-directory.
    epa_out_dir = path.join(out_dir, "epa_out")
    run_epa_ng(tree=tree,
               ref_msa_fastafile=ref_msa_fastafile,
               study_msa_fastafile=study_msa_fastafile,
               model=model,
               chunk_size=chunk_size,
               threads=threads,
               out_dir=epa_out_dir,
               print_cmds=print_cmds)

    # Turn the JPLACE result into the final newick tree.
    gappa_jplace_to_newick(
        jplace_file=path.join(epa_out_dir, "epa_result_parsed.jplace"),
        outfile=out_tree,
        print_cmds=print_cmds)
コード例 #2
0
def split_ref_study_papara(papara_out: dict, ref_seqnames: set, ref_fasta: str,
                           study_fasta: str):
    '''Write the reference and study rows of a PaPaRa alignment to two
    separate FASTA files.

    papara_out is the PaPaRa output already parsed into a dictionary keyed by
    sequence id, ref_seqnames is the set of ids that belong to the reference
    MSA, and ref_fasta / study_fasta are the output filenames. Any id in
    papara_out that is not in ref_seqnames is treated as a study sequence.
    Every id in ref_seqnames must be present in papara_out (a missing one
    raises KeyError).'''

    # Whatever is not a reference id is a study id.
    study_ids = set(papara_out.keys()).difference(ref_seqnames)

    # Collect the aligned rows for each group.
    ref_rows = {}
    for seq_id in ref_seqnames:
        ref_rows[seq_id] = papara_out[seq_id]

    study_rows = {}
    for seq_id in study_ids:
        study_rows[seq_id] = papara_out[seq_id]

    write_fasta(ref_rows, ref_fasta)
    write_fasta(study_rows, study_fasta)
コード例 #3
0
    def test_read_write_fasta(self):
        '''Round-trip check: sequences written by write_fasta must compare
        equal to what read_fasta returns for the same file.'''

        expected = {
            "seq1": "GNATNGAC",
            "seq2": "GTCGTGGC",
            "seq3": "GNCTGAGATTAACC"
        }

        # Use a throwaway directory so nothing is left behind on disk.
        with TemporaryDirectory() as scratch_dir:
            fasta_path = path.join(scratch_dir, "test.fna")
            write_fasta(expected, fasta_path)
            observed = read_fasta(fasta_path)

        self.assertEqual(expected, observed)
コード例 #4
0
def place_seqs_pipeline(study_fasta, ref_msa, tree, hmm, out_tree,
                        alignment_tool, threads, out_dir, chunk_size,
                        print_cmds):
    '''Full pipeline for running sequence placement.

    Aligns the study sequences with either hmmalign or PaPaRa, writes the
    reference and study portions of that alignment to separate FASTA files
    in out_dir, runs EPA-NG on those files and converts the resulting JPLACE
    file into a newick tree written to out_tree.

    alignment_tool must be "hmmalign" or "papara"; any other value raises
    ValueError before any external tool is run. ref_msa is the path to the
    reference MSA FASTA. hmm is only used by the hmmalign branch; tree is
    used by PaPaRa and by EPA-NG.'''

    # Fail early with a clear message. Without this check an unknown tool
    # would skip both branches and crash later on an unbound local variable.
    if alignment_tool not in ("hmmalign", "papara"):
        raise ValueError("Unsupported alignment_tool {!r}: expected "
                         "\"hmmalign\" or \"papara\".".format(alignment_tool))

    if alignment_tool == "hmmalign":

        out_stockholm = path.join(out_dir, "query_align.stockholm")

        system_call_check("hmmalign  --trim --dna --mapali " + ref_msa +
                          " --informat FASTA -o " + out_stockholm + " " + hmm +
                          " " + study_fasta,
                          print_out=print_cmds)

        hmmalign_out = read_stockholm(out_stockholm, clean_char=True)

        # Specify split FASTA files to be created.
        study_msa_fastafile = path.join(out_dir, "study_seqs_hmmalign.fasta")
        ref_msa_fastafile = path.join(out_dir, "ref_seqs_hmmalign.fasta")

        # Sequence ids are taken from the original (unaligned) input files.
        ref_seqnames = set(read_fasta(ref_msa).keys())
        study_seqnames = set(read_fasta(study_fasta).keys())

        ref_hmmalign_subset = {seq: hmmalign_out[seq] for seq in ref_seqnames}
        study_hmmalign_subset = {
            seq: hmmalign_out[seq]
            for seq in study_seqnames
        }

        write_fasta(ref_hmmalign_subset, ref_msa_fastafile)
        write_fasta(study_hmmalign_subset, study_msa_fastafile)

    else:  # alignment_tool == "papara"

        # Read in ref seqs FASTA as a dict. Kept in its own variable so the
        # ref_msa parameter keeps meaning "path to the reference MSA".
        ref_seqs = read_fasta(ref_msa)

        # Run PaPaRa to place study sequences and read in Phylip file.
        papara_out = run_papara(tree=tree,
                                ref_msa=ref_seqs,
                                study_fasta=study_fasta,
                                out_dir=out_dir,
                                threads=threads,
                                print_cmds=print_cmds)

        # Specify split FASTA files to be created.
        study_msa_fastafile = path.join(out_dir, "study_seqs_papara.fasta")
        ref_msa_fastafile = path.join(out_dir, "ref_seqs_papara.fasta")

        # Split PaPaRa output into two FASTA files containing study and
        # reference sequences respectively.
        split_ref_study_papara(papara_out=papara_out,
                               ref_seqnames=set(ref_seqs.keys()),
                               study_fasta=study_msa_fastafile,
                               ref_fasta=ref_msa_fastafile)

    # Run EPA-NG to output .jplace file.
    epa_out_dir = path.join(out_dir, "epa_out")

    run_epa_ng(tree=tree,
               ref_msa_fastafile=ref_msa_fastafile,
               study_msa_fastafile=study_msa_fastafile,
               chunk_size=chunk_size,
               threads=threads,
               out_dir=epa_out_dir,
               print_cmds=print_cmds)

    jplace_outfile = path.join(epa_out_dir, "epa_result.jplace")

    # Convert the placements into the final newick tree.
    gappa_jplace_to_newick(jplace_file=jplace_outfile,
                           outfile=out_tree,
                           print_cmds=print_cmds)
コード例 #5
0
def main():
    '''Command-line entry point: reduce a FASTA file to a single best
    sequence.

    Sequences whose length falls outside [lower_length, upper_length] or
    whose proportion of N characters exceeds prop_n are discarded, as are
    zero-length records. Of the remaining sequences the one with the lowest
    proportion of Ns is written (upper-cased) to
    <output_dir>/<input basename>_best.fna. The script exits with a message,
    without writing a file, when the input is empty or nothing survives
    filtering.'''

    parser = argparse.ArgumentParser(
        description="Reads in FASTA file and keeps single sequence (or "
        "possibly no sequence). Works by first screening out all "
        "sequences of length less or greater than the lower and "
        "upper bounds given. Will then screen out sequences with "
        "greater than a set percent of Ns. Then preferentially keeps "
        "sequences with lower proportion of Ns. Finally if there is "
        "still a tie, will just choose a sequence randomly. Output file only "
        "created if there is a final sequence to be written.",
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument("-f",
                        "--fasta",
                        metavar="FASTA",
                        type=str,
                        help="Path to FASTA file",
                        required=True)

    parser.add_argument("-l",
                        "--lower_length",
                        metavar="SIZE",
                        type=int,
                        required=True,
                        help="Min length of each seq in FASTA")

    parser.add_argument("-u",
                        "--upper_length",
                        metavar="SIZE",
                        type=int,
                        required=True,
                        help="Max length of each seq in FASTA")

    parser.add_argument("-p",
                        "--prop_n",
                        metavar="SIZE",
                        type=float,
                        required=True,
                        help="Proportion of N characters "
                        "permitted per-sequence")

    parser.add_argument("-o",
                        "--output_dir",
                        metavar="PATH",
                        type=str,
                        required=True,
                        help="Output directory to write final FASTA of single "
                        "sequence IF there is a sequence left to write "
                        "after filtering.")

    parser.add_argument(
        "--rename_seq",
        action="store_true",
        help="Flag to indicate that the sequence header should "
        "be renamed to be the the first 2 fields of the "
        "filenames after delimiting by \'_\'")

    parser.add_argument(
        "--rename_seq_full",
        action="store_true",
        help="Flag to indicate that the sequence header should "
        "be renamed to be the full filename.")

    args = parser.parse_args()

    in_fasta = read_fasta(args.fasta)

    # If no sequences in file then stop job.
    if not in_fasta:
        sys.exit("Stopping - no sequences in file.")

    # Proportion of N characters for every sequence that passes the length
    # and N-content filters; sequences that fail are simply left out.
    seq_N_pro = {}

    for seq_id, sequence in in_fasta.items():

        seq_len = len(sequence)

        # A zero-length record has no defined N proportion (it would divide
        # by zero) and there is nothing to write for it, so drop it.
        if seq_len == 0:
            continue

        if seq_len < args.lower_length or seq_len > args.upper_length:
            continue

        N_pro = sequence.upper().count("N") / seq_len

        if N_pro <= args.prop_n:
            seq_N_pro[seq_id] = N_pro

    # If nothing passed the filters then stop job.
    if not seq_N_pro:
        sys.exit("Stopping - no sequences left after filtering.")

    # Pick the sequence with the lowest proportion of Ns. On a tie min()
    # returns the first such id in the dictionary's iteration order.
    best_seq = min(seq_N_pro, key=seq_N_pro.get)

    out_basename = path.splitext(path.basename(args.fasta))[0]

    outfile = path.join(args.output_dir, out_basename + "_best.fna")

    # If rename_seq option set then replace current header with first 2 fields
    # of filename after delimiting by "_". A basename with fewer than 2
    # fields is used as-is rather than raising IndexError.
    if args.rename_seq:
        seqname = "_".join(out_basename.split("_")[:2])
    elif args.rename_seq_full:
        seqname = out_basename
    else:
        seqname = best_seq

    # Wrap the best sequence in a dictionary so it can be output easily.
    out_seq = {seqname: in_fasta[best_seq].upper()}

    write_fasta(out_seq, outfile)
コード例 #6
0
def place_seqs_pipeline(study_fasta,
                        ref_dir,
                        out_tree,
                        threads,
                        out_dir,
                        min_align,
                        chunk_size,
                        verbose):
    '''Place study sequences into a reference tree.

    Aligns the study sequences against the reference MSA with hmmalign,
    passes the aligned study sequences through check_alignments (with the
    min_align setting), writes reference and study alignments to separate
    FASTA files in out_dir, runs EPA-ng and converts the parsed JPLACE output
    into a newick tree at out_tree. Exits immediately if study_fasta
    contains a space.'''

    # The hmmalign command is assembled as a space-separated string, so a
    # space in the input path is rejected up front.
    if " " in study_fasta:
        sys.exit("Stopping - remove the space from the input FASTA filepath.")

    # Look up the reference alignment, tree, HMM and model in ref_dir.
    ref_msa, tree, hmm, model = identify_ref_files(ref_dir)

    # Intermediate files all live under out_dir.
    stockholm_path = path.join(out_dir, "query_align.stockholm")
    study_msa_fastafile = path.join(out_dir, "study_seqs_hmmalign.fasta")
    ref_msa_fastafile = path.join(out_dir, "ref_seqs_hmmalign.fasta")

    # Align study sequences onto the reference MSA (Stockholm output).
    hmmalign_cmd = "".join(("hmmalign --trim --dna --mapali ", ref_msa,
                            " --informat FASTA -o ", stockholm_path,
                            " ", hmm,
                            " ", study_fasta))
    system_call_check(hmmalign_cmd,
                      print_command=verbose,
                      print_stdout=verbose,
                      print_stderr=verbose)

    aligned = read_stockholm(stockholm_path, clean_char=True)

    ref_ids = set(read_fasta(ref_msa).keys())
    raw_study_seqs = read_fasta(study_fasta)

    # Reference rows are copied straight out of the combined alignment.
    ref_aligned = {}
    for seq_id in ref_ids:
        ref_aligned[seq_id] = aligned[seq_id]

    # Study rows are selected by check_alignments.
    study_aligned = check_alignments(raw_seqs=raw_study_seqs,
                                     aligned_seqs=aligned,
                                     min_align=min_align,
                                     verbose=verbose)

    write_fasta(ref_aligned, ref_msa_fastafile)
    write_fasta(study_aligned, study_msa_fastafile)

    # Placement step: EPA-ng writes its results into its own sub-directory.
    epa_out_dir = path.join(out_dir, "epa_out")
    run_epa_ng(tree=tree,
               ref_msa_fastafile=ref_msa_fastafile,
               study_msa_fastafile=study_msa_fastafile,
               model=model,
               chunk_size=chunk_size,
               threads=threads,
               out_dir=epa_out_dir,
               print_cmds=verbose)

    # Turn the JPLACE result into the final newick tree.
    gappa_jplace_to_newick(
        jplace_file=path.join(epa_out_dir, "epa_result_parsed.jplace"),
        outfile=out_tree,
        print_cmds=verbose)