Beispiel #1
0
    def estimate(self, hits, outfile, placements):
        """
        Score every placement for completeness and contamination and write
        the result as a tab-separated table.

        hits:       path handed to base.readTSV; every row needs a
                    "profile" column
        outfile:    path of the table that is written
        placements: indexable collection of dicts, one per placement; each
                    needs a "node" key plus the remaining output columns and
                    is updated in place ("completeness", "contamination",
                    "file")

        Returns None when the "touch" option is set (outfile is only
        touched then), otherwise True.
        """
        logging.info("Estimating scores now")

        # touch mode: only create the output file and leave
        if self.cfg["touch"]:
            file.touch(outfile)
            logging.info("Returning as we only touch")
            return

        # tally how often each profile shows up in the hit table
        profile_counts = {}
        for record in base.readTSV(hits):
            name = record["profile"]
            profile_counts[name] = profile_counts.get(name, 0) + 1

        # profiles seen at least once / seen more than once
        found = set(profile_counts)
        duplicated = {name for name, count in profile_counts.items() if count > 1}

        for placement in placements:
            expected = self.readSet(placement["node"])
            n_expected = len(expected)
            # share of the node's profile set that was hit at all
            completeness = len(found & expected) / n_expected
            # share of the node's profile set that was hit repeatedly
            contamination = len(duplicated & expected) / n_expected

            # stored as percentages with two decimals
            placement["completeness"] = round(completeness * 100, 2)
            placement["contamination"] = round(contamination * 100, 2)

        log("Finished estimating")

        # column order of the output table
        columns = (
            "completeness",
            "contamination",
            "node",
            "n",
            "ngenomes",
            "cover",
            "nPlacements",
            "taxid",
            "lineage",
            "taxidlineage",
            "file",
        )
        with open(outfile, "w") as handle:
            handle.write("{}\n".format("\t".join(columns)))
            for placement in placements:
                # every row carries the configured name in its last column
                placement["file"] = self.cfg["name"]
                fields = [str(placement[column]) for column in columns]
                handle.write("{}\n".format("\t".join(fields)))

        log("Wrote estimates to: {}".format(outfile))

        return True
Beispiel #2
0
    def get_silent_contig(self, fasta, hits, placements):
        """
        given hits and the fasta file, we can calculate how many
        DNA bp can not be detected as contaminant, because no markers
        are on these contigs

        fasta:      path handed to Fasta(), or None if no DNA fasta exists
        hits:       path handed to base.readTSV; every row needs the
                    columns "profile" and "chrom"
        placements: iterable of dicts, each with a "set" entry holding the
                    profiles of that placement; updated in place

        Each placement gets a "max_silent_contamination" entry: the
        percentage (rounded to two decimals) of the summed record lengths
        that belongs to records without any hit for the placement's
        profiles, or "NA" when no fasta was given or the fasta contains no
        sequence at all. Always returns None.
        """
        logging.debug("Calculating silent contig fraction")
        if fasta is None:
            logging.debug(
                "As no DNA fasta was given, we can't compute the contig fraction"
            )
            for placement in placements:
                placement["max_silent_contamination"] = "NA"
            return

        # get dict to link profile to contigs
        profile_2_contigs = defaultdict(set)
        for row in base.readTSV(hits):
            profile_2_contigs[row["profile"]].add(row["chrom"])

        # read in fasta once
        dna_lens = {rec.name: len(str(rec)) for rec in Fasta(fasta)}
        total_len = sum(dna_lens.values())

        # without any sequence the fraction is undefined; report "NA"
        # instead of failing with a ZeroDivisionError further down
        if total_len == 0:
            logging.debug(
                "The DNA fasta holds no sequence, we can't compute the contig fraction"
            )
            for placement in placements:
                placement["max_silent_contamination"] = "NA"
            return

        # identical for every placement, so build it only once
        all_contigs = set(dna_lens)

        # for each placement compute the silent fraction
        for placement in placements:
            # make union of contigs carrying one of the placement's profiles
            contigs = set()
            for profile in placement["set"]:
                contigs |= profile_2_contigs[profile]
            # define missing contigs as contig names we did not locate any marker genes on
            missing_len = sum(dna_lens[contig] for contig in all_contigs - contigs)
            placement["max_silent_contamination"] = round(
                missing_len / total_len * 100, 2)
            logging.debug("The silent fraction could be up to {}%".format(
                placement["max_silent_contamination"]))
        return
Beispiel #3
0
    def estimate(self, hits, outfile, placements):
        """
        Compute completeness, contamination and the silent contig fraction
        for every placement and write the result via self.write_outfile.

        hits:       path handed to base.readTSV; every row needs a
                    "profile" column
        outfile:    path passed on to self.write_outfile
        placements: indexable collection of dicts, one per placement; each
                    needs a "node" key and is updated in place ("set",
                    "completeness", "contamination" and whatever
                    self.get_silent_contig adds)

        Returns None when the "touch" option is set (outfile is only
        touched then), otherwise True.
        """
        logging.info("Estimating scores now")

        # touch mode: only create the output file and leave
        if self.cfg["touch"]:
            file.touch(outfile)
            logging.info("Returning as we only touch")
            return

        # number of hits per profile
        counts = {}
        for entry in base.readTSV(hits):
            profile = entry["profile"]
            if profile in counts:
                counts[profile] += 1
            else:
                counts[profile] = 1

        # profiles with at least one hit / with more than one hit
        hit_profiles = set(counts)
        repeated_profiles = set(
            profile for profile, amount in counts.items() if amount > 1
        )

        for placement in placements:
            marker_set = self.readSet(placement["node"])
            # keep the set on the placement, get_silent_contig reads it later
            placement["set"] = marker_set
            size = len(marker_set)
            hit_share = len(hit_profiles & marker_set) / size
            repeated_share = len(repeated_profiles & marker_set) / size

            # stored as percentages with two decimals
            placement["completeness"] = round(hit_share * 100, 2)
            placement["contamination"] = round(repeated_share * 100, 2)

        # adds the silent fraction to every placement
        self.get_silent_contig(self._clean_fasta, hits, placements)

        log("Finished estimating")
        self.write_outfile(outfile, placements)

        return True
Beispiel #4
0
    def prepareAlignment(
        self, pplaceAlinment, hmmerOutput, proteinList, proteinFasta, config, cfg, tmpDir,
    ):
        """
        Align every protein with a profile hit against its profile and
        concatenate the single alignments, ordered by profile name.

        Sets self.lenscmgs to the number of rows in the hmmer table and, if
        that number is above zero, self.input to the return value of
        base.horizontalConcat.

        pplaceAlinment: first argument of base.horizontalConcat
                        (presumably the path of the concatenated
                        alignment - TODO confirm)
        hmmerOutput:    path handed to base.readTSV; every row needs the
                        columns "profile" and "subject"
        proteinList:    path of a text file that is read line by line
        proteinFasta:   path handed to Fasta()
        config:         object offering pkgfile(refpkg_name, key)
        cfg:            not used inside this method
        tmpDir:         directory receiving the per-protein .faa/.aln files
        """
        # NOTE(review): the list is read (so the file has to exist) but is
        # not used anywhere below
        with open(proteinList) as listing:
            profiles = [entry.strip() for entry in listing]

        # load hmmer results; one profile name per row, sorted by name
        hmmer = base.readTSV(hmmerOutput)
        scmgs = sorted(row["profile"] for row in hmmer)
        # count the placements
        self.lenscmgs = len(scmgs)

        if self.lenscmgs > 0:
            # profile -> protein name; for repeated profiles the last row wins
            subject_by_profile = {row["profile"]: row["subject"] for row in hmmer}
            proteinnames = [subject_by_profile[profile] for profile in scmgs]

            # load proteins
            proteins = Fasta(proteinFasta)

            # one alignment per profile/protein pair
            alignments = []
            for profile, protein in zip(scmgs, proteinnames):
                sequence = proteins[protein]
                label = "{}_{}".format(profile, protein)
                aligned_path = os.path.join(tmpDir, "{}.aln".format(label))
                # write the single sequence to its own fasta file
                query_path = base.writeFasta(
                    os.path.join(tmpDir, "{}.faa".format(label)), label, sequence
                )
                reference_alignment = config.pkgfile(
                    "{}.refpkg".format(profile), "aln_fasta"
                )
                reference_hmm = config.pkgfile("{}.refpkg".format(profile), "profile")
                # run hmmalign for this profile
                hmmalign("hmmalign", query_path, aligned_path).run(
                    reference_alignment, reference_hmm
                )
                alignments.append(aligned_path)

            # after this we concatenate the alignments
            # ordered by the profile name
            self.input = base.horizontalConcat(
                pplaceAlinment,
                alignments,
                scmgs,
                config.pkgfile("{}.refpkg".format("concat"), "aln_fasta"),
            )