def test_split_ref_study_papara(self):
        '''Basic test for split_ref_study_papara function.'''

        # Read in PaPaRa output.
        papara_out = read_phylip(exp_papara_phylip)

        # Read in expected output files.
        exp_fasta = [read_fasta(exp_ref_fasta), read_fasta(exp_study_fasta)]

        with TemporaryDirectory() as temp_dir:
            out_ref_fasta = path.join(temp_dir, "ref_test.fna")
            out_study_fasta = path.join(temp_dir, "study_test.fna")

            split_ref_study_papara(papara_out=papara_out,
                                   ref_seqnames=set(exp_fasta[0].keys()),
                                   ref_fasta=out_ref_fasta,
                                   study_fasta=out_study_fasta)

            # Read in output files.
            obs_fasta = [
                read_fasta(out_ref_fasta),
                read_fasta(out_study_fasta)
            ]

        self.assertEqual(exp_fasta, obs_fasta)
Example #2
def place_seqs_pipeline(study_fasta, ref_dir, out_tree, threads, out_dir,
                        chunk_size, print_cmds):
    '''Full pipeline for running sequence placement.'''

    # Identify reference files to use.
    ref_msa, tree, hmm, model = identify_ref_files(ref_dir)

    # Run hmmalign to place study sequences into reference MSA.
    out_stockholm = path.join(out_dir, "query_align.stockholm")

    system_call_check("hmmalign  --trim --dna --mapali " + ref_msa +
                      " --informat FASTA -o " + out_stockholm + " " + hmm +
                      " " + study_fasta,
                      print_out=print_cmds)

    hmmalign_out = read_stockholm(out_stockholm, clean_char=True)

    # Specify split FASTA files to be created.
    study_msa_fastafile = path.join(out_dir, "study_seqs_hmmalign.fasta")
    ref_msa_fastafile = path.join(out_dir, "ref_seqs_hmmalign.fasta")

    ref_seqnames = set(read_fasta(ref_msa).keys())

    study_seqnames = set(read_fasta(study_fasta).keys())

    ref_hmmalign_subset = {seq: hmmalign_out[seq] for seq in ref_seqnames}
    study_hmmalign_subset = {seq: hmmalign_out[seq] for seq in study_seqnames}

    write_fasta(ref_hmmalign_subset, ref_msa_fastafile)
    write_fasta(study_hmmalign_subset, study_msa_fastafile)

    # Run EPA-NG to output .jplace file.
    epa_out_dir = path.join(out_dir, "epa_out")

    run_epa_ng(tree=tree,
               ref_msa_fastafile=ref_msa_fastafile,
               study_msa_fastafile=study_msa_fastafile,
               model=model,
               chunk_size=chunk_size,
               threads=threads,
               out_dir=epa_out_dir,
               print_cmds=print_cmds)

    jplace_outfile = path.join(epa_out_dir, "epa_result_parsed.jplace")

    gappa_jplace_to_newick(jplace_file=jplace_outfile,
                           outfile=out_tree,
                           print_cmds=print_cmds)
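A hedged usage sketch for the pipeline above; the file paths and parameter
values below are placeholders, not defaults from the source:

    # Hypothetical inputs: ref_dir must hold the MSA, tree, HMM, and model
    # files that identify_ref_files() expects.
    place_seqs_pipeline(study_fasta="study_seqs.fasta",
                        ref_dir="reference_files",
                        out_tree="placed_seqs.tre",
                        threads=4,
                        out_dir="placement_working",
                        chunk_size=5000,
                        print_cmds=True)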
def check_overlapping_seqs(in_seq, in_tab, verbose):
    '''Check that ASV ids overlap between the input FASTA and sequence
    abundance table. Throws an error if none overlap and otherwise prints the
    number of overlapping ids to STDERR when verbose is set. Also throws a
    warning if the input ASV table contains a column called "taxonomy".'''

    FASTA_ASVs = set(read_fasta(in_seq).keys())

    in_table = read_seqabun(in_tab)

    table_ASVs = set(in_table.index.values)

    num_ASV_overlap = len(table_ASVs.intersection(FASTA_ASVs))

    if 'taxonomy' in in_table.columns:
        print(
            "Warning - column named \"taxonomy\" in abundance table - if "
            "this corresponds to taxonomic labels this should be removed "
            "before running this pipeline.",
            file=sys.stderr)

    # Throw error if 0 ASVs overlap between the two files.
    if num_ASV_overlap == 0:
        sys.exit("Stopping - no ASV ids overlap between input FASTA and "
                 "sequence abundance table")

    # Otherwise print to STDERR how many ASVs overlap between the two files
    # if verbose set.
    if verbose:
        print(str(num_ASV_overlap) + " of " + str(len(table_ASVs)) +
              " sequence ids overlap between input table and FASTA.\n",
              file=sys.stderr)
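A hedged usage sketch for check_overlapping_seqs; the file names below are
placeholders for an ASV FASTA and a matching abundance table:

    check_overlapping_seqs(in_seq="asv_seqs.fasta",
                           in_tab="asv_abundance.biom",
                           verbose=True)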
    def test_check_alignment(self):
        '''Test that poorly aligned sequences are identified correctly.'''

        test_to_align = path.join(test_dir_path, "seqs_to_align.fasta")
        test_aligned = path.join(test_dir_path, "hmmalign_out.fasta")

        test_to_align_in = read_fasta(test_to_align)
        test_aligned_in = read_fasta(test_aligned)

        exp_passing_seqs = [
            'barely_passable', 'fc72d6433952bdcfab2b357f4198bc2e',
            'fdae4a46c18c4727fe027a0fb8e57c8a',
            'feab23adead1ecdd465a0e900f45132f'
        ]

        obs_subset = check_alignments(raw_seqs=test_to_align_in,
                                      aligned_seqs=test_aligned_in,
                                      min_align=0.8,
                                      verbose=True)

        self.assertEqual(exp_passing_seqs, sorted(list(obs_subset.keys())))
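For context, a minimal sketch of the kind of length-based filter this test
exercises (an illustrative stand-in, not the library's check_alignments
implementation): keep a sequence when its aligned, non-gap positions cover at
least min_align of its unaligned length.

    def simple_alignment_filter(raw_seqs, aligned_seqs, min_align=0.8):
        '''Return aligned sequences whose non-gap length covers at least
        min_align of the corresponding raw sequence length.'''
        passing = {}
        for seq_id, raw_seq in raw_seqs.items():
            aligned = aligned_seqs[seq_id]
            # Count aligned positions, treating "-" and "." as gap characters.
            aligned_len = len(aligned) - aligned.count("-") - aligned.count(".")
            if aligned_len / len(raw_seq) >= min_align:
                passing[seq_id] = aligned
        return passing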
    def test_run_papara(self):
        '''Basic test for run_papara function.'''

        exp_phylip = read_phylip(exp_papara_phylip)
        in_msa = read_fasta(test_msa)

        with TemporaryDirectory() as temp_dir:
            obs_phylip = run_papara(tree=test_tree,
                                    ref_msa=in_msa,
                                    out_dir=temp_dir,
                                    study_fasta=test_study_seqs)

        self.assertEqual(exp_phylip, obs_phylip)
Example #6
def place_seqs_pipeline(study_fasta, ref_msa, tree, out_tree, threads,
                        papara_output, out_dir, chunk_size, print_cmds):
    '''Full pipeline for running sequence placement.'''

    # Read in ref seqs FASTA as a dict.
    ref_msa = read_fasta(ref_msa)

    # Either read in PaPaRa output or run it.
    if papara_output:
        # Read in PaPaRa output if already done.
        papara_out = read_phylip(papara_output, check_input=True)

    else:
        # Run PaPaRa to place study sequences and read in Phylip file.
        papara_out = run_papara(tree=tree,
                                ref_msa=ref_msa,
                                study_fasta=study_fasta,
                                out_dir=out_dir,
                                threads=threads,
                                print_cmds=print_cmds)

    # Specify split FASTA files to be created.
    study_msa_fastafile = path.join(out_dir, "study_seqs_papara.fasta")
    ref_msa_fastafile = path.join(out_dir, "ref_seqs_papara.fasta")

    # Split PaPaRa output into two FASTA files containing study and reference
    # sequences respectively.
    split_ref_study_papara(papara_out=papara_out,
                           ref_seqnames=set(ref_msa.keys()),
                           study_fasta=study_msa_fastafile,
                           ref_fasta=ref_msa_fastafile)

    # Run EPA-NG to output .jplace file.
    epa_out_dir = path.join(out_dir, "epa_out")

    run_epa_ng(tree=tree,
               ref_msa_fastafile=ref_msa_fastafile,
               study_msa_fastafile=study_msa_fastafile,
               chunk_size=chunk_size,
               threads=threads,
               out_dir=epa_out_dir,
               print_cmds=print_cmds)

    jplace_outfile = path.join(epa_out_dir, "epa_result.jplace")

    gappa_jplace_to_newick(jplace_file=jplace_outfile,
                           outfile=out_tree,
                           print_cmds=print_cmds)
Example #7
    def test_read_write_fasta(self):
        '''Basic test that FASTA files are read and written correctly.'''

        test_seqs_dict = {
            "seq1": "GNATNGAC",
            "seq2": "GTCGTGGC",
            "seq3": "GNCTGAGATTAACC"
        }

        # Write these sequences to a temp file and then read them back in again.
        with TemporaryDirectory() as temp_dir:
            outfile = path.join(temp_dir, "test.fna")

            write_fasta(test_seqs_dict, outfile)

            test_seqs_dict_in = read_fasta(outfile)

        self.assertEqual(test_seqs_dict, test_seqs_dict_in)
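For reference, a minimal sketch of what a FASTA read/write round trip like the
one tested above can look like (an illustrative stand-in, not the package's
actual read_fasta/write_fasta functions):

    def simple_read_fasta(filepath):
        '''Read a FASTA file into a {header: sequence} dict.'''
        seqs = {}
        name = None
        with open(filepath) as in_fh:
            for line in in_fh:
                line = line.rstrip()
                if line.startswith(">"):
                    # Keep only the first whitespace-delimited field of the header.
                    name = line[1:].split()[0]
                    seqs[name] = ""
                elif name is not None:
                    seqs[name] += line
        return seqs

    def simple_write_fasta(seqs, filepath):
        '''Write a {header: sequence} dict to a FASTA file.'''
        with open(filepath, "w") as out_fh:
            for name, seq in seqs.items():
                out_fh.write(">" + name + "\n" + seq + "\n")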
def main():

    parser = argparse.ArgumentParser(
        description="Creates an output FASTA file for each sample, with each "
                    "ASV repeated once for every count in that sample.",
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument("-f", "--fasta", metavar="FASTA", type=str,
                        help="Path to full FASTA file.", required=True)

    parser.add_argument("-b", "--biom", metavar="BIOM", type=str,
                        help="Path to BIOM table.", required=True)

    parser.add_argument("-o", "--outdir", metavar="PATH", type=str,
                        help="Name of folder to make for output files.", required=True)

    args = parser.parse_args()

    in_fasta = read_fasta(args.fasta)

    in_table = biom_to_pandas_df(biom.load_table(args.biom))

    # If no sequences in file then stop job.
    if not in_fasta:
        sys.exit("Stopping - no sequences in file.")

    make_output_dir(args.outdir)

    for sample in in_table.columns:
        sample_outfile = args.outdir + "/" + sample + ".fasta"

        with open(sample_outfile, 'wt') as sample_outfh:
            for asv in in_table.index.values:
                asv_count = in_table.loc[asv, sample]
                if asv_count > 0:
                    for i in range(int(asv_count)):
                        print(">" + asv + "_" + sample + "_" + str(i),
                              file=sample_outfh)
                        print(in_fasta[asv], file=sample_outfh)
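A hypothetical command line for this script, with placeholder script and file
names: python per_sample_fasta.py -f asv_seqs.fasta -b asv_table.biom -o per_sample_out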
Example #9
def check_overlapping_seqs(in_seq, in_tab, verbose):
    '''Check that ASV ids overlap between the input FASTA and sequence
    abundance table. Throws an error if none overlap and otherwise prints the
    number of overlapping ids to STDERR when verbose is set.'''

    FASTA_ASVs = set(read_fasta(in_seq).keys())

    table_ASVs = set(read_seqabun(in_tab).index.values)

    num_ASV_overlap = len(table_ASVs.intersection(FASTA_ASVs))

    # Throw error if 0 ASVs overlap between the two files.
    if num_ASV_overlap == 0:
        sys.exit("Stopping - no ASV ids overlap between input FASTA and "
                 "sequence abundance table")

    # Otherwise print to STDERR how many ASVs overlap between the two files
    # if verbose set.
    if verbose:
        print(str(num_ASV_overlap) + " of " + str(len(table_ASVs)) +
              " sequence ids overlap between input table and FASTA.\n",
              file=sys.stderr)
Example #10
def place_seqs_pipeline(study_fasta, ref_msa, tree, hmm, out_tree,
                        alignment_tool, threads, out_dir, chunk_size,
                        print_cmds):
    '''Full pipeline for running sequence placement.'''

    if alignment_tool == "hmmalign":

        out_stockholm = path.join(out_dir, "query_align.stockholm")

        system_call_check("hmmalign  --trim --dna --mapali " + ref_msa +
                          " --informat FASTA -o " + out_stockholm + " " + hmm +
                          " " + study_fasta,
                          print_out=print_cmds)

        hmmalign_out = read_stockholm(out_stockholm, clean_char=True)

        # Specify split FASTA files to be created.
        study_msa_fastafile = path.join(out_dir, "study_seqs_hmmalign.fasta")
        ref_msa_fastafile = path.join(out_dir, "ref_seqs_hmmalign.fasta")

        ref_seqnames = set(read_fasta(ref_msa).keys())

        study_seqnames = set(read_fasta(study_fasta).keys())

        ref_hmmalign_subset = {seq: hmmalign_out[seq] for seq in ref_seqnames}
        study_hmmalign_subset = {
            seq: hmmalign_out[seq]
            for seq in study_seqnames
        }

        write_fasta(ref_hmmalign_subset, ref_msa_fastafile)
        write_fasta(study_hmmalign_subset, study_msa_fastafile)

    elif alignment_tool == "papara":

        # Read in ref seqs FASTA as a dict.
        ref_msa = read_fasta(ref_msa)

        # Run PaPaRa to place study sequences and read in Phylip file.
        papara_out = run_papara(tree=tree,
                                ref_msa=ref_msa,
                                study_fasta=study_fasta,
                                out_dir=out_dir,
                                threads=threads,
                                print_cmds=print_cmds)

        # Specify split FASTA files to be created.
        study_msa_fastafile = path.join(out_dir, "study_seqs_papara.fasta")
        ref_msa_fastafile = path.join(out_dir, "ref_seqs_papara.fasta")

        # Split PaPaRa output into two FASTA files containing study and reference
        # sequences respectively.
        split_ref_study_papara(papara_out=papara_out,
                               ref_seqnames=set(ref_msa.keys()),
                               study_fasta=study_msa_fastafile,
                               ref_fasta=ref_msa_fastafile)

    # Run EPA-NG to output .jplace file.
    epa_out_dir = path.join(out_dir, "epa_out")

    run_epa_ng(tree=tree,
               ref_msa_fastafile=ref_msa_fastafile,
               study_msa_fastafile=study_msa_fastafile,
               chunk_size=chunk_size,
               threads=threads,
               out_dir=epa_out_dir,
               print_cmds=print_cmds)

    jplace_outfile = path.join(epa_out_dir, "epa_result.jplace")

    gappa_jplace_to_newick(jplace_file=jplace_outfile,
                           outfile=out_tree,
                           print_cmds=print_cmds)
def main():

    parser = argparse.ArgumentParser(
        description="Reads in FASTA file and keeps single sequence (or "
        "possibly no sequence). Works by first screening out all "
        "sequences of length less or greater than the lower and "
        "upper bounds given. Will then screen out sequences with "
        "greater than a set percent of Ns. Then preferentially keeps "
        "sequences with lower proportion of Ns. Finally if there is "
        "still a tie, will just choose a sequence randomly. Output file only "
        "created if there is a final sequence to be written.",
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument("-f",
                        "--fasta",
                        metavar="FASTA",
                        type=str,
                        help="Path to FASTA file",
                        required=True)

    parser.add_argument("-l",
                        "--lower_length",
                        metavar="SIZE",
                        type=int,
                        required=True,
                        help="Min length of each seq in FASTA")

    parser.add_argument("-u",
                        "--upper_length",
                        metavar="SIZE",
                        type=int,
                        required=True,
                        help="Max length of each seq in FASTA")

    parser.add_argument("-p",
                        "--prop_n",
                        metavar="SIZE",
                        type=float,
                        required=True,
                        help="Proportion of N characters "
                        "permitted per-sequence")

    parser.add_argument("-o",
                        "--output_dir",
                        metavar="PATH",
                        type=str,
                        required=True,
                        help="Output directory to write final FASTA of single "
                        "sequence IF there is a sequence left to write "
                        "after filtering.")

    parser.add_argument(
        "--rename_seq",
        action="store_true",
        help="Flag to indicate that the sequence header should "
        "be renamed to be the the first 2 fields of the "
        "filenames after delimiting by \'_\'")

    parser.add_argument(
        "--rename_seq_full",
        action="store_true",
        help="Flag to indicate that the sequence header should "
        "be renamed to be the full filename.")

    args = parser.parse_args()

    in_fasta = read_fasta(args.fasta)

    # If no sequences in file then stop job.
    if not in_fasta:
        sys.exit("Stopping - no sequences in file.")

    # Remove all sequences with length outside cut-off range or with greater
    # than the specified proportion of N characters.
    seq2remove = set()

    seq_N_pro = {}

    for seq_id, sequence in in_fasta.items():

        seq_len = len(sequence)

        N_pro = sequence.upper().count("N") / seq_len

        if seq_len < args.lower_length or seq_len > args.upper_length or N_pro > args.prop_n:
            seq2remove.add(seq_id)
        else:
            seq_N_pro[seq_id] = N_pro

    # Remove the specified sequences.
    for seq_id in seq2remove:
        del in_fasta[seq_id]

    # If no sequences in file then stop job.
    if not in_fasta:
        sys.exit("Stopping - no sequences left after filtering.")

    # Of the remaining sequences figure out which has the lowest proportion of
    # Ns. If there is a tie then the first such sequence encountered is kept.
    best_seq = None
    lowest_pro_N = 1.1

    for seq_id, pro_N in seq_N_pro.items():
        if pro_N < lowest_pro_N:
            lowest_pro_N = pro_N
            best_seq = seq_id

    out_basename = path.splitext(path.basename(args.fasta))[0]

    outfile = path.join(args.output_dir, out_basename + "_best.fna")

    # Add the best sequence to a dictionary so it can be output easily.
    out_seq = {}

    # If rename_seq option set then replace current header with first 2 fields
    # of filename after delimiting by "_".
    if args.rename_seq:
        file_split = out_basename.split("_")
        seqname = file_split[0] + "_" + file_split[1]
    elif args.rename_seq_full:
        seqname = out_basename
    else:
        seqname = best_seq

    out_seq[seqname] = in_fasta[best_seq].upper()

    write_fasta(out_seq, outfile)
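A hypothetical command line for this script, with placeholder script and file
names: python keep_best_seq.py -f candidate_seqs.fasta -l 1200 -u 1600 -p 0.05 -o best_seqs --rename_seq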
Example #12
def place_seqs_pipeline(study_fasta,
                        ref_dir,
                        out_tree,
                        threads,
                        out_dir,
                        min_align,
                        chunk_size,
                        verbose):
    '''Full pipeline for running sequence placement.'''

    # Throw error if there is a space in the study FASTA filepath.
    if " " in study_fasta:
        sys.exit("Stopping - remove the space from the input FASTA filepath.")

    # Identify reference files to use.
    ref_msa, tree, hmm, model = identify_ref_files(ref_dir)

    # Run hmmalign to place study sequences into reference MSA.
    out_stockholm = path.join(out_dir, "query_align.stockholm")

    system_call_check("hmmalign --trim --dna --mapali " +
                      ref_msa + " --informat FASTA -o " +
                      out_stockholm + " " + hmm + " " + study_fasta,
                      print_command=verbose, print_stdout=verbose,
                      print_stderr=verbose)

    hmmalign_out = read_stockholm(out_stockholm, clean_char=True)

    # Specify split FASTA files to be created.
    study_msa_fastafile = path.join(out_dir, "study_seqs_hmmalign.fasta")
    ref_msa_fastafile = path.join(out_dir, "ref_seqs_hmmalign.fasta")

    ref_seqnames = set(read_fasta(ref_msa).keys())
    study_seqs = read_fasta(study_fasta)

    ref_hmmalign_subset = {seq: hmmalign_out[seq] for seq in ref_seqnames}

    study_hmmalign_subset = check_alignments(raw_seqs=study_seqs,
                                             aligned_seqs=hmmalign_out,
                                             min_align=min_align,
                                             verbose=verbose)

    write_fasta(ref_hmmalign_subset, ref_msa_fastafile)
    write_fasta(study_hmmalign_subset, study_msa_fastafile)

    # Run EPA-ng to place input sequences and output JPLACE file.
    epa_out_dir = path.join(out_dir, "epa_out")

    run_epa_ng(tree=tree,
               ref_msa_fastafile=ref_msa_fastafile,
               study_msa_fastafile=study_msa_fastafile,
               model=model,
               chunk_size=chunk_size,
               threads=threads,
               out_dir=epa_out_dir,
               print_cmds=verbose)

    jplace_outfile = path.join(epa_out_dir, "epa_result_parsed.jplace")

    gappa_jplace_to_newick(jplace_file=jplace_outfile,
                           outfile=out_tree,
                           print_cmds=verbose)
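A hedged usage sketch for this final pipeline variant, which adds the
min_align filtering step and verbose output compared to the earlier versions;
the paths and values below are placeholders:

    place_seqs_pipeline(study_fasta="study_seqs.fasta",
                        ref_dir="reference_files",
                        out_tree="placed_seqs.tre",
                        threads=4,
                        out_dir="placement_working",
                        min_align=0.8,
                        chunk_size=5000,
                        verbose=True)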