Beispiel #1
0
 def search_species(self, seqs):
     """Yield extended hits read from an existing alignment output file.

     The reference sequences are parsed from ``self.species_fp`` (with
     ``trim_desc=True``) and paired with ``seqs`` in a ``HitExtender``.
     Each hit parsed from ``self.output_fp`` is passed through
     ``extend_hit`` and yielded.
     """
     with open(self.species_fp) as ref_file:
         references = list(parse_fasta(ref_file, trim_desc=True))
     extender = HitExtender(seqs, references)
     # The hits file stays open while the generator is being consumed.
     with open(self.output_fp) as hits_file:
         for raw_hit in VsearchAligner._parse(hits_file):
             yield extender.extend_hit(raw_hit)
Beispiel #2
0
 def _get_subject_seq(self, subject_id):
     """Return the sequence of ``subject_id`` from the BLAST database.

     Runs ``blastdbcmd`` against ``self.db`` with its output directed to a
     temporary file, then returns the sequence of the first FASTA record
     found in that file.

     Raises:
         subprocess.CalledProcessError: if ``blastdbcmd`` exits non-zero.
         IndexError: if the output contains no FASTA records.
     """
     # Use the temporary file as a context manager so it is closed and
     # removed deterministically, even when blastdbcmd or parsing fails,
     # instead of relying on garbage collection.
     with tempfile.NamedTemporaryFile() as subject_outfile:
         subject_outfile_fp = subject_outfile.name
         args = [
             "blastdbcmd", "-db", self.db, "-entry", subject_id, "-out",
             subject_outfile_fp
         ]
         subprocess.check_call(args)
         with open(subject_outfile_fp) as f:
             return list(parse_fasta(f, trim_desc=True))[0][1]
Beispiel #3
0
 def load(self, assemblies):
     """Populate the sequence-to-assembly maps and the sequence table.

     Each line of ``self.accession_fp`` is split on whitespace; the first
     token is a sequence ID and the second an accession that is looked up
     in ``assemblies``.  The sequences themselves are then read from
     ``self.fasta_fp`` into ``self.seqs``.

     Raises:
         IndexError: if a line has fewer than two tokens.
         KeyError: if an accession is missing from ``assemblies``.
     """
     with open(self.accession_fp, "r") as accession_file:
         for row in accession_file:
             # split() with no arguments already ignores surrounding
             # whitespace, including the trailing newline.
             fields = row.split()
             seqid, accession = fields[0], fields[1]
             matched = assemblies[accession]
             self.assemblies[seqid] = matched
             self.seqids_by_assembly[matched.accession].append(seqid)

     with open(self.fasta_fp, "r") as fasta_file:
         for record_id, sequence in parse_fasta(fasta_file):
             self.seqs[record_id] = sequence
Beispiel #4
0
 def test_parse_fasta(self):
     """parse_fasta joins multi-line sequences and keeps full descriptions."""
     fasta_lines = [
         ">Seq1 abc def\n",
         "GGCTGCTATCAG\n",
         "CTAGCATCGTCGCATCGAC\n",
         ">Seq2\n",
         "ACGCTAGCTGCAAAA\n",
     ]
     records = parse_fasta(fasta_lines)

     first = next(records)
     self.assertEqual(
         first, ("Seq1 abc def", "GGCTGCTATCAGCTAGCATCGTCGCATCGAC"))

     second = next(records)
     self.assertEqual(second, ("Seq2", "ACGCTAGCTGCAAAA"))

     # The iterator must be exhausted after the two records.
     with self.assertRaises(StopIteration):
         next(records)
Beispiel #5
0
    def __init__(self, species_file, ref_fp, mismatch_file,
                 batch_size=10, num_cpus=None):
        """Store the search inputs and the fixed search settings.

        ``species_file`` is parsed immediately with ``parse_fasta`` (using
        ``trim_desc=True``) and kept as a list; the remaining arguments
        are stored unchanged.
        """
        # Parse first, so a parsing failure leaves no attributes set.
        type_strains = parse_fasta(species_file, trim_desc=True)
        self.typestrain_seqs = list(type_strains)

        # Caller-supplied inputs.
        self.reference_fasta_fp = ref_fp
        self.mismatch_file = mismatch_file

        # Search settings; max_hits is kept as a string.
        self.min_pct_id = 97.0
        self.num_threads = num_cpus
        self.batch_size = batch_size
        self.max_hits = "10000"
Beispiel #6
0
 def ssu_seqs(self):
     """Return the (description, sequence) records accepted by ``is_16S``.

     A result already stored in ``self._ssu_seqs`` is returned as-is.
     Otherwise the file at ``self.rna_fp`` is parsed, after calling
     ``self.download_rna()`` when that file does not exist.  If the
     download raises ``urllib.error.HTTPError``, the accession and the
     error are printed and an empty list is returned without being
     cached.
     """
     cached = self._ssu_seqs
     if cached is not None:
         return cached

     rna_missing = not os.path.exists(self.rna_fp)
     if rna_missing:
         try:
             self.download_rna()
         except urllib.error.HTTPError as err:
             print(self.accession)
             print(err)
             return []

     with open(self.rna_fp, "rt") as rna_file:
         records = list(parse_fasta(rna_file))

     matches = []
     for desc, seq in records:
         if is_16S(desc):
             matches.append((desc, seq))
     self._ssu_seqs = matches
     return matches
Beispiel #7
0
    def search_species(self, query_seqs):
        """Search ``query_seqs`` against the species file and yield hits.

        A ``VsearchAligner`` built on ``self.species_fp`` runs the search
        using ``self.species_input_fp`` and ``self.species_output_fp``.
        The ``threads`` option is passed only when ``self.num_cpus`` is
        truthy.  Every hit is passed through ``HitExtender.extend_hit``
        before being yielded.
        """
        aligner = VsearchAligner(self.species_fp)
        search_options = dict(min_id=0.9, maxaccepts=5)
        if self.num_cpus:
            search_options["threads"] = self.num_cpus
        hits = aligner.search(
            query_seqs,
            self.species_input_fp,
            self.species_output_fp,
            **search_options)

        with open(self.species_fp) as ref_file:
            references = list(parse_fasta(ref_file, trim_desc=True))
        extender = HitExtender(query_seqs, references)
        for raw_hit in hits:
            yield extender.extend_hit(raw_hit)
Beispiel #8
0
def process_greengenes_seqs(seqs_fp,
                            accessions_fp,
                            output_fp=REFSEQS_FASTA_FP):
    """Write a de-duplicated, accession-labelled reference FASTA file.

    Identical sequences are collapsed to a single record, labelled with
    the accession, source and ID of the first ID seen for that sequence.
    Each group of IDs that share a sequence is written on its own line,
    space-separated, to the duplicates file.

    Args:
        seqs_fp: Path to the FASTA file; a ``.gz`` file is decompressed
            with ``gunzip -f`` first.
        accessions_fp: Path to the accessions table; a ``.gz`` file is
            decompressed with ``gunzip -f`` first.
        output_fp: Output FASTA path.  If it is a directory, both the
            FASTA file and the duplicates file are created inside it
            under their default names.

    Returns:
        The path of the FASTA file that was written.

    Raises:
        subprocess.CalledProcessError: if ``gunzip`` fails.
        KeyError: if a sequence ID has no entry in the accessions table.
    """
    duplicates_fp = GG_DUPLICATE_FP
    if os.path.isdir(output_fp):
        duplicates_fp = os.path.join(output_fp, duplicates_fp)
        output_fp = os.path.join(output_fp, REFSEQS_FASTA_FP)

    # Extract table of accessions
    if accessions_fp.endswith(".gz"):
        subprocess.check_call(["gunzip", "-f", accessions_fp])
        # presumably maps the .gz path to the decompressed path -- verify
        accessions_fp = gunzip_fp(accessions_fp)

    # Load accessions
    gg_accessions = {}
    with open(accessions_fp) as f:
        for ggid, src, acc in parse_greengenes_accessions(f):
            gg_accessions[ggid] = (acc, src)

    # Extract FASTA file
    if seqs_fp.endswith(".gz"):
        subprocess.check_call(["gunzip", "-f", seqs_fp])
        seqs_fp = gunzip_fp(seqs_fp)

    # Remove duplicate reference seqs
    uniq_seqs = collections.defaultdict(list)
    with open(seqs_fp) as f:
        for ggid, seq in parse_fasta(f):
            uniq_seqs[seq].append(ggid)

    with open(duplicates_fp, "w") as dups, open(output_fp, "w") as f:
        for seq, ggids in uniq_seqs.items():
            ggid = ggids[0]
            if len(ggids) > 1:
                # Terminate each group with a newline; without it the
                # last ID of one group runs into the first ID of the next.
                dups.write(" ".join(ggids) + "\n")
            # Re-label seqs with accession numbers
            acc, src = gg_accessions[ggid]
            f.write(">%s %s %s\n%s\n" % (acc, src, ggid, seq))

    return output_fp
Beispiel #9
0
def process_ltp_seqs(input_fp, output_fp=SPECIES_FASTA_FP):
    """Re-format a FASTA file into ``>accession<TAB>species`` records.

    Each input description is split on ``|``; the third field is used as
    the accession and the fourth as the species name.  If ``output_fp``
    is a directory, the output is written inside it under the default
    file name.

    Returns:
        The path of the file that was written.
    """
    if os.path.isdir(output_fp):
        output_fp = os.path.join(output_fp, SPECIES_FASTA_FP)

    times_seen = collections.defaultdict(int)
    # Re-format FASTA file
    with open(input_fp) as f_in:
        records = parse_fasta(f_in)
        with open(output_fp, "w") as f_out:
            for desc, seq in records:
                fields = desc.split("|")
                # Some accessions refer to genomes with more than one 16S
                # gene, so an accession can legitimately be repeated with
                # distinct gene sequences.  Repeats get a numbered suffix.
                accession = fields[2]
                previous_count = times_seen[accession]
                times_seen[accession] = previous_count + 1
                if previous_count > 0:
                    accession = "%s_repeat%s" % (accession, previous_count)
                species_name = fields[3]
                f_out.write(
                    ">%s\t%s\n%s\n" % (accession, species_name, seq))
    return output_fp
Beispiel #10
0
def main(argv=None):
    """Run the command-line workflow using the variable mismatch rate.

    Parses ``argv`` (argparse falls back to the process arguments when it
    is None), reads the query sequences, downloads the type strain data
    if the default file is missing, selects an aligner, optionally loads
    the mismatch database, and writes the results for every query.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "query_fasta",
        type=argparse.FileType("r"),
        help="Query sequences FASTA file")
    parser.add_argument(
        "--output_dir",
        help=("Output directory (default: basename of query sequences FASTA "
              "file, plus '_unassigned')"))
    parser.add_argument(
        "--type_strain_fasta",
        default="unassigner_species.fasta",
        help=("Type strain sequences FASTA file (default: %(default)s). "
              "If the default file is not found, sequences are downloaded "
              "and re-formatted automatically."))
    parser.add_argument(
        "--threshold",
        type=float,
        help=("Sequence identity threshold for ruling out species-level "
              "compatibility. Default value is 0.975 for the standard "
              "algorithm and 0.991 for the soft threshold algorithm."))
    parser.add_argument(
        "--ref_mismatch_positions",
        help=("File of mismatch positions in reference database. The file may "
              "be compressed in gzip format."))
    parser.add_argument(
        "--num_cpus",
        type=int,
        help=("Number of CPUs to use during sequence aligment (default: "
              "use all the CPUs)"))
    parser.add_argument(
        "--soft_threshold",
        action="store_true",
        help="Use soft threshold algorithm.")
    parser.add_argument(
        "--verbose",
        action="store_true",
        help="Activate verbose mode.")
    args = parser.parse_args(argv)

    # An explicit threshold wins; otherwise the default depends on the
    # algorithm that was selected.
    if args.threshold is not None:
        min_id = args.threshold
    elif args.soft_threshold:
        min_id = 0.991
    else:
        min_id = 0.975

    if args.verbose:
        logging.basicConfig(
            format='%(levelname)s: %(message)s', level=logging.INFO)

    query_seqs = list(parse_fasta(args.query_fasta, trim_desc=True))

    output_dir = args.output_dir
    if output_dir is None:
        query_root, _ = os.path.splitext(args.query_fasta.name)
        output_dir = query_root + "_unassigned"

    # Download type strain files if needed
    type_strain_fp = args.type_strain_fasta
    using_default_fp = (
        type_strain_fp == parser.get_default("type_strain_fasta"))
    if using_default_fp and not os.path.exists(type_strain_fp):
        download_type_strain_data()

    with open(type_strain_fp) as species_file:
        species_names = dict(parse_species_names(species_file))

    writer = OutputWriter(output_dir, species_names)

    alignment_query_fp = writer.output_fp("unassigner_query.fasta")
    alignment_output_fp = writer.output_fp("unassigner_query_hits.txt")
    # Reuse an existing hits file instead of running the alignment again.
    if not os.path.exists(alignment_output_fp):
        aligner = UnassignAligner(type_strain_fp)
        aligner.species_input_fp = alignment_query_fp
        aligner.species_output_fp = alignment_output_fp
        aligner.num_cpus = args.num_cpus
    else:
        aligner = FileAligner(type_strain_fp, alignment_output_fp)

    mismatch_fp = args.ref_mismatch_positions
    if mismatch_fp:
        # NOTE(review): this handle is never closed here; confirm that
        # load_database reads it eagerly before wrapping it in `with`.
        mm_db_file = (
            gzip.open(mismatch_fp, "rt")
            if mismatch_fp.endswith(".gz")
            else open(mismatch_fp))
        VariableMismatchRate.load_database(mm_db_file)

    app = UnassignerApp(
        aligner,
        VariableMismatchRate,
        min_id=min_id,
        soft_threshold=args.soft_threshold)
    for query_id, query_results in app.unassign(query_seqs):
        writer.write_results(query_id, query_results)
Beispiel #11
0
 def test_parse_empty_fasta(self):
     """Parsing an empty sequence of lines produces no records."""
     records = list(parse_fasta([]))
     self.assertEqual(records, [])
Beispiel #12
0
 def from_fasta(cls, f):
     """Build an instance of ``cls`` from the records parsed out of ``f``.

     The result of ``parse_fasta(f)`` is passed to the constructor
     directly, without being turned into a list first.
     """
     return cls(parse_fasta(f))
Beispiel #13
0
def main(argv=None):
    """Run the command-line workflow using the threshold algorithm.

    Parses ``argv`` (argparse falls back to the process arguments when it
    is None), reads the query sequences, downloads the type strain data
    if the default file is missing, selects an aligner, and writes the
    results for every query.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "query_fasta",
        type=argparse.FileType("r"),
        help="Query sequences FASTA file")
    parser.add_argument(
        "--output_dir",
        help=("Output directory (default: basename of query sequences FASTA "
              "file, plus '_unassigned')"))
    parser.add_argument(
        "--type_strain_fasta",
        default="unassigner_species.fasta",
        help=("Type strain sequences FASTA file (default: %(default)s). "
              "If the default file is not found, sequences are downloaded "
              "and re-formatted automatically."))
    parser.add_argument(
        "--num_cpus",
        type=int,
        help=("Number of CPUs to use during sequence aligment (default: "
              "use all the CPUs)"))
    parser.add_argument(
        "--verbose",
        action="store_true",
        help="Activate verbose mode.")
    args = parser.parse_args(argv)

    if args.verbose:
        logging.basicConfig(
            format='%(levelname)s: %(message)s', level=logging.INFO)

    query_seqs = list(parse_fasta(args.query_fasta, trim_desc=True))

    output_dir = args.output_dir
    if output_dir is None:
        query_root, _ = os.path.splitext(args.query_fasta.name)
        output_dir = query_root + "_unassigned"

    # Download type strain files if needed
    type_strain_fp = args.type_strain_fasta
    using_default_fp = (
        type_strain_fp == parser.get_default("type_strain_fasta"))
    if using_default_fp and not os.path.exists(type_strain_fp):
        download_type_strain_data()

    with open(type_strain_fp) as species_file:
        species_names = dict(parse_species_names(species_file))

    writer = OutputWriter(output_dir, species_names)

    alignment_query_fp = writer.output_fp("unassigner_query.fasta")
    alignment_output_fp = writer.output_fp("unassigner_query_hits.txt")
    # Reuse an existing hits file instead of running the alignment again.
    if not os.path.exists(alignment_output_fp):
        aligner = UnassignAligner(type_strain_fp)
        aligner.species_input_fp = alignment_query_fp
        aligner.species_output_fp = alignment_output_fp
        aligner.num_cpus = args.num_cpus
    else:
        aligner = FileAligner(type_strain_fp, alignment_output_fp)

    algorithm = ThresholdAlgorithm(aligner)
    for query_id, query_results in algorithm.unassign(query_seqs):
        writer.write_results(query_id, query_results)