def check_index(self, filename, format, **kwargs):
        # check if Python3 installation has sqlite3
        try:
            import sqlite3
        except ImportError:
            sqlite3 = None

        parsed = list(SearchIO.parse(filename, format, **kwargs))
        # compare values by index
        indexed = SearchIO.index(filename, format, **kwargs)
        self.assertEqual(len(parsed), len(indexed.keys()))
        # compare values by index_db, only if sqlite3 is present
        if sqlite3 is not None:
            db_indexed = SearchIO.index_db(':memory:', [filename], format, **kwargs)
            self.assertEqual(len(parsed), len(db_indexed.keys()))

        for qres in parsed:
            idx_qres = indexed[qres.id]
            # parsed and indexed qresult are different objects!
            self.assertNotEqual(id(qres), id(idx_qres))
            # but they should have the same attribute values
            self.assertTrue(compare_search_obj(qres, idx_qres))
            # sqlite3 comparison, only if it's present
            if sqlite3 is not None:
                dbidx_qres = db_indexed[qres.id]
                self.assertNotEqual(id(qres), id(dbidx_qres))
                self.assertTrue(compare_search_obj(qres, dbidx_qres))

        indexed._proxy._handle.close()  # TODO - Better solution
        if sqlite3 is not None:
            db_indexed.close()
            db_indexed._con.close()
    def check_raw(self, filename, id, raw, **kwargs):
        """Index filename using keyword arguments, check get_raw(id)==raw."""
        idx = SearchIO.index(filename, self.fmt, **kwargs)
        raw = _as_bytes(raw)
        # Anticipate cases where the raw string and/or file uses different
        # newline characters ~ we set everything to \n.
        new = idx.get_raw(id)
        self.assertTrue(isinstance(new, bytes),
                        "Didn't get bytes from %s get_raw" % self.fmt)
        self.assertEqual(raw.replace(b'\r\n', b'\n'),
                         new.replace(b'\r\n', b'\n'))
        idx.close()

        # Now again, but using SQLite backend
        if sqlite3:
            idx = SearchIO.index_db(":memory:", filename, self.fmt, **kwargs)
            new = idx.get_raw(id)
            self.assertTrue(isinstance(new, bytes),
                            "Didn't get bytes from %s get_raw" % self.fmt)
            self.assertEqual(raw.replace(b'\r\n', b'\n'),
                             new.replace(b'\r\n', b'\n'))
            idx.close()

        if os.path.isfile(filename + ".bgz"):
            # Do the tests again with the BGZF compressed file
            print("[BONUS %s.bgz]" % filename)
            self.check_raw(filename + ".bgz", id, raw, **kwargs)
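The two test helpers above exercise the full dict-like interface returned by SearchIO.index and SearchIO.index_db: lazy lookup by query ID, get_raw and close. A minimal stand-alone sketch of those calls, assuming a hypothetical BLAST tabular file "my_hits.tab" and query ID "query1":

from Bio import SearchIO

idx = SearchIO.index("my_hits.tab", "blast-tab")            # lazy index, parses records on access
qresult = idx["query1"]                                      # one QueryResult, parsed on demand
raw = idx.get_raw("query1")                                  # raw bytes for that record
idx.close()

# Same interface, with the offsets stored in SQLite (requires the sqlite3 module).
db_idx = SearchIO.index_db(":memory:", ["my_hits.tab"], "blast-tab")
print(len(db_idx), "records indexed")
db_idx.close()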
Example #4
    def parse_search_file(input_file, mode, format="hmmer3-text", index_file=None):
        """Load search results into a dict-like object keyed by query ID.

        mode "index_db" (or a list of several input files) builds an SQLite
        index, "index" builds a lazy in-memory index and "parse" reads every
        record into an OrderedDict.
        """
        if mode == "index_db" or ((not isinstance(input_file, str)) and (len(input_file) > 1)):
            index = index_file if index_file else "tmp.idx"
            seq_dict = SearchIO.index_db(index, [input_file] if isinstance(input_file, str) else input_file, format=format)
        elif mode == "index":
            seq_dict = SearchIO.index(input_file if isinstance(input_file, str) else input_file[0], format=format)
        elif mode == "parse":
            seq_dict = OrderedDict()
            for record in SearchIO.parse(input_file if isinstance(input_file, str) else input_file[0], format=format):
                seq_dict[record.id] = record
            #seq_dict = SeqIO.to_dict(SeqIO.parse(input_file if isinstance(input_file, str) else input_file[0], format=format))

        return seq_dict
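A hedged usage sketch for parse_search_file above, called here as a plain function; the input file name is a placeholder and the OrderedDict/SearchIO imports it relies on are assumed to live at module level.

from collections import OrderedDict
from Bio import SearchIO

results = parse_search_file("hmmer_hits.txt", mode="parse", format="hmmer3-text")
for query_id, qresult in results.items():
    print(query_id, len(qresult))    # number of hits per query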
    def check_index(self, filename, format, **kwargs):
        # check if Python3 installation has sqlite3
        try:
            import sqlite3
        except ImportError:
            sqlite3 = None

        if filename.endswith(".bgz"):
            handle = gzip.open(filename)
            parsed = list(SearchIO.parse(handle, format, **kwargs))
            handle.close()
        else:
            parsed = list(SearchIO.parse(filename, format, **kwargs))
        # compare values by index
        indexed = SearchIO.index(filename, format, **kwargs)
        self.assertEqual(
            len(parsed), len(indexed),
            "Should be %i records in %s, index says %i" %
            (len(parsed), filename, len(indexed)))
        # compare values by index_db, only if sqlite3 is present
        if sqlite3 is not None:
            db_indexed = SearchIO.index_db(':memory:', [filename], format,
                                           **kwargs)
            self.assertEqual(
                len(parsed), len(db_indexed),
                "Should be %i records in %s, index_db says %i" %
                (len(parsed), filename, len(db_indexed)))

        for qres in parsed:
            idx_qres = indexed[qres.id]
            # parsed and indexed qresult are different objects!
            self.assertNotEqual(id(qres), id(idx_qres))
            # but they should have the same attribute values
            self.assertTrue(compare_search_obj(qres, idx_qres))
            # sqlite3 comparison, only if it's present
            if sqlite3 is not None:
                dbidx_qres = db_indexed[qres.id]
                self.assertNotEqual(id(qres), id(dbidx_qres))
                self.assertTrue(compare_search_obj(qres, dbidx_qres))

        indexed.close()
        if sqlite3 is not None:
            db_indexed.close()
            db_indexed._con.close()

        if os.path.isfile(filename + ".bgz"):
            # Do the tests again with the BGZF compressed file
            print("[BONUS %s.bgz]" % filename)
            self.check_index(filename + ".bgz", format, **kwargs)
    def check_raw(self, filename, id, raw, **kwargs):
        """Index filename using **kwargs, check get_raw(id)==raw."""
        idx = SearchIO.index(filename, self.fmt, **kwargs)
        raw = _as_bytes(raw)
        self.assertEqual(raw, idx.get_raw(id))
        idx.close()

        #Now again, but using SQLite backend
        if sqlite3:
            idx = SearchIO.index_db(":memory:", filename, self.fmt, **kwargs)
            self.assertEqual(raw, idx.get_raw(id))
            idx.close()

        if os.path.isfile(filename + ".bgz"):
            #Do the tests again with the BGZF compressed file
            print "[BONUS %s.bgz]" % filename
            self.check_raw(filename + ".bgz", id, raw, **kwargs)
    def check_index(self, filename, format, **kwargs):
        # check if Python3 installation has sqlite3
        try:
            import sqlite3
        except ImportError:
            sqlite3 = None

        if filename.endswith(".bgz"):
            handle = gzip.open(filename)
            parsed = list(SearchIO.parse(handle, format, **kwargs))
            handle.close()
        else:
            parsed = list(SearchIO.parse(filename, format, **kwargs))
        # compare values by index
        indexed = SearchIO.index(filename, format, **kwargs)
        self.assertEqual(len(parsed), len(indexed),
                         "Should be %i records in %s, index says %i"
                         % (len(parsed), filename, len(indexed)))
        # compare values by index_db, only if sqlite3 is present
        if sqlite3 is not None:
            db_indexed = SearchIO.index_db(':memory:', [filename], format, **kwargs)
            self.assertEqual(len(parsed), len(db_indexed),
                             "Should be %i records in %s, index_db says %i"
                             % (len(parsed), filename, len(db_indexed)))

        for qres in parsed:
            idx_qres = indexed[qres.id]
            # parsed and indexed qresult are different objects!
            self.assertNotEqual(id(qres), id(idx_qres))
            # but they should have the same attribute values
            self.assertTrue(compare_search_obj(qres, idx_qres))
            # sqlite3 comparison, only if it's present
            if sqlite3 is not None:
                dbidx_qres = db_indexed[qres.id]
                self.assertNotEqual(id(qres), id(dbidx_qres))
                self.assertTrue(compare_search_obj(qres, dbidx_qres))

        indexed.close()
        if sqlite3 is not None:
            db_indexed.close()
            db_indexed._con.close()

        if os.path.isfile(filename + ".bgz"):
            # Do the tests again with the BGZF compressed file
            print("[BONUS %s.bgz]" % filename)
            self.check_index(filename + ".bgz", format, **kwargs)
Example #8
    def check_index(self, filename, format, **kwargs):
        if filename.endswith(".bgz"):
            with gzip.open(filename) as handle:
                parsed = list(SearchIO.parse(handle, format, **kwargs))
        else:
            parsed = list(SearchIO.parse(filename, format, **kwargs))
        # compare values by index
        indexed = SearchIO.index(filename, format, **kwargs)
        self.assertEqual(
            len(parsed),
            len(indexed),
            "Should be %i records in %s, index says %i" %
            (len(parsed), filename, len(indexed)),
        )
        # compare values by index_db, only if sqlite3 is present
        if sqlite3 is not None:
            db_indexed = SearchIO.index_db(":memory:", [filename], format,
                                           **kwargs)
            self.assertEqual(
                len(parsed),
                len(db_indexed),
                "Should be %i records in %s, index_db says %i" %
                (len(parsed), filename, len(db_indexed)),
            )

        for qres in parsed:
            idx_qres = indexed[qres.id]
            # parsed and indexed qresult are different objects!
            self.assertNotEqual(id(qres), id(idx_qres))
            # but they should have the same attribute values
            self.compare_search_obj(qres, idx_qres)
            # sqlite3 comparison, only if it's present
            if sqlite3 is not None:
                dbidx_qres = db_indexed[qres.id]
                self.assertNotEqual(id(qres), id(dbidx_qres))
                self.compare_search_obj(qres, dbidx_qres)

        indexed.close()
        if sqlite3 is not None:
            db_indexed.close()
            db_indexed._con.close()

        if os.path.isfile(filename + ".bgz"):
            # Do the tests again with the BGZF compressed file
            print(f"[BONUS {filename}.bgz]")
            self.check_index(filename + ".bgz", format, **kwargs)
Example #9
    if not (os.path.isfile(db_fasta + ".nhr") and
            os.path.isfile(db_fasta + ".nin") and
            os.path.isfile(db_fasta + ".nsq")):
        stop_err("Missing BLAST database for %s" % db_fasta)
    cmd = NcbiblastnCommandline(query=query_fasta, db=db_fasta,
                                out=blast_file, outfmt=6,
                                evalue=1e-5)
    print(cmd)
    stdout, stderr = cmd()
    return

if not os.path.isfile(blast_file):
    do_blast(assembly_fasta, reference_fasta, blast_file)

contigs = SeqIO.index(assembly_fasta, "fasta")
blast_results = SearchIO.index(blast_file, "blast-tab")

reference_parser = SeqIO.parse(reference_fasta, "fasta")

fasta_handle = open(output_fasta, "w")
fasta_saved_count = 0
fasta_short_dropped = 0

offset = 0
ref_offsets = dict()
for record in reference_parser:
    ref_offsets[hack_ncbi_fasta_name(record.id)] = offset
    offset += len(record)

def reverse_complement_hsp_fragment(frag, query_length):
    rev = SearchIO.HSPFragment(hit_id=frag.hit_id, query_id=frag.query_id)
Example #10
from Bio import SearchIO
result_handle = open("results/blastp_resultAD.txt")
idx = SearchIO.index('results/blastp_resultAD.txt',
                     'blast-tab')  # comments=True)
idss = (idx.keys())
homs = {}
rec_list_all = []

for ids in idss:
    rec_list = []
    for rec in idx[ids].hsps:
        rec_list.append((rec.hit_id, rec.evalue, rec.ident_pct))
        rec_list_all.append(
            (rec.query_id, rec.hit_id, rec.evalue, rec.ident_pct))
    homs[ids] = {ids: rec_list}
leg_uniq = []
ids = []
for p in sorted(rec_list_all, key=lambda x: x[0][1]):
    if p[2] < 10E-3 and p[1] not in p[0]:
        if p[1] not in ids:
            ids.append(p[1])
            leg_uniq.append(p)

output = open('results/leg_blastp_hits.txt', 'w')
output.write('id' + '\t' + 'blastp_hit' + '\t' + 'e-value' + '\t' +
             'ident_pct' + '\n')
for p in leg_uniq:
    qid = '%s' % p[0]
    hit = '%s' % p[1]
    ev = '%s' % p[2]
    per = '%s' % p[3]
Example #11
        def iterator(blast_dict):
            for entry in blast_dict:
                entry_hits = []
                for hit in blast_dict[entry].hits:
                    if hit.id not in black_list:
                        # filter hits
                        entry_hits.append(hit)
                if entry_hits:
                    yield QueryResult(hits=entry_hits, id=entry)

    elif args.mode == "both":

        def iterator(blast_dict):
            for entry in blast_dict:
                if entry not in black_list:
                    entry_hits = []
                    for hit in blast_dict[entry].hits:
                        if hit.id not in black_list:
                            # filter hits
                            entry_hits.append(hit)
                    if entry_hits:
                        yield QueryResult(hits=entry_hits, id=entry)


blast_results = SearchIO.index(args.input, args.format)

SearchIO.write(iterator(blast_results), args.output, args.format)
if args.output != "output":
    out_fd.close()
if args.input != "stdin":
    in_fd.close()
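The same hit filtering can also be expressed with QueryResult.hit_filter, which returns a new QueryResult containing only the hits that pass the predicate. A hedged sketch, with the black list and file names as hypothetical placeholders:

from Bio import SearchIO

black_list = {"contig_7", "contig_12"}            # hypothetical hit IDs to drop

def filtered(qresults):
    for qresult in qresults:
        kept = qresult.hit_filter(lambda hit: hit.id not in black_list)
        if len(kept):                              # skip queries left with no hits
            yield kept

SearchIO.write(filtered(SearchIO.parse("hits.tab", "blast-tab")),
               "filtered.tab", "blast-tab")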
        sys_exit("Missing BLAST database for %s" % db_fasta)
    cmd = NcbiblastnCommandline(query=query_fasta,
                                db=db_fasta,
                                out=blast_file,
                                outfmt=6,
                                evalue=1e-5)
    print(cmd)
    stdout, stderr = cmd()
    return


if not os.path.isfile(blast_file):
    do_blast(assembly_fasta, reference_fasta, blast_file)

contigs = SeqIO.index(assembly_fasta, "fasta")
blast_results = SearchIO.index(blast_file, "blast-tab")

max_len = 0
for record in SeqIO.parse(reference_fasta, "fasta"):
    max_len += SPACER + len(record)
max_len -= SPACER
if os.path.isfile(reference_genbank):
    reference_parser = SeqIO.parse(reference_genbank, "genbank")
else:
    reference_parser = SeqIO.parse(reference_fasta, "fasta")

if output_fasta:
    sys.stderr.write(
        "WARNING - Consider using order_assembly.py instead for FASTA output\n"
    )
    fasta_handle = open(output_fasta, "w")
Example #13
filename = "substrates.txt"
with open(filename) as src:
    p = [line.split('\r') for line in src][0]
legdict = {}
for line in p:
    pp = line.split()
    lid = pp[0].lower(),
    alias = pp[1]
    legdict[lid[0]] = {
        "lid": lid[0],
        "alias": alias}

from Bio import SearchIO
result_handle = open("results/blastp_resultAD.txt")
# comments=True)
idx = SearchIO.index('results/blastp_resultAD.txt', 'blast-tab')
idss = (idx.keys())
homs = {}
rec_list_all = []

for ids in idss:
    rec_list = []
    for rec in idx[ids].hsps:
        rec_list.append((rec.hit_id, rec.evalue, rec.ident_pct))
        rec_list_all.append(
            (rec.query_id, rec.hit_id, rec.evalue, rec.ident_pct))
    homs[ids] = {ids: rec_list}
leg_uniq = []
ids = []
for p in sorted(rec_list_all, key=lambda x: x[0][1]):
    if p[2] < 10E-3:
Example #14
###

filename = "substrates.txt"
with open(filename) as src:
    p = [line.split('\r') for line in src][0]
legdict = {}
for line in p:
    pp = line.split()
    lid = pp[0].lower(),
    alias = pp[1]
    legdict[lid[0]] = {"lid": lid[0], "alias": alias}

from Bio import SearchIO
result_handle = open("results/blastp_resultAD.txt")
# comments=True)
idx = SearchIO.index('results/blastp_resultAD.txt', 'blast-tab')
idss = (idx.keys())
homs = {}
rec_list_all = []

for ids in idss:
    rec_list = []
    for rec in idx[ids].hsps:
        rec_list.append((rec.hit_id, rec.evalue, rec.ident_pct))
        rec_list_all.append(
            (rec.query_id, rec.hit_id, rec.evalue, rec.ident_pct))
    homs[ids] = {ids: rec_list}
leg_uniq = []
ids = []
for p in sorted(rec_list_all, key=lambda x: x[0][1]):
    if p[2] < 10E-3:
Example #15
def parallel_BLAST(list_of_genes, seqindex, split_by, out):
    """
    Run a BLASTp all-vs.-all search on a subset of genes from a database.

    Splits queries into separate files by a divisor split_by, BLASTs them
    simultaneously and then concatenates them using cat. Currently uses
    subprocess over BioPython's BLASTp wrapper for easy parallelization
    using mp.Pool and subprocess_BLAST. Returns a SearchIO BLAST tabular object.

    Requires cat, makeblastdb and blastp in your $PATH!

    Arguments:
        list_of_genes = List of (remaining) noncore protein IDs.
        seqindex      = SeqIO.index of all proteins.
        split_by      = Divisor to split files into.
        out           = File prefix for results files given current cluster size being investigated.
    """

    ##### Generate FASTA database of (remaining) noncore proteins. #####
    outfast = open(out, "w")
    for seq in list_of_genes:
        outfast.write(">{0}\n{1}\n".format(seqindex[seq].id,
                                           seqindex[seq].seq))
    outfast.close()  # Close file here, makeblastdb has problems otherwise.
    subblastlog.write("Wrote {0} sequences to {1}...\n".format(
        len(list_of_genes), out))

    ##### Run makeblastdb on FASTA database. #####
    sp.call([
        "makeblastdb", "-in", out, "-dbtype", "prot", "-out",
        "{0}.db".format(out)
    ],
            stdout=subblastlog)

    ##### Split FASTA database by split_by and generate list of BLASTp commands. #####
    count = 0
    query_cmds = []  # Handle for simultaneous BLASTp commands.
    to_split = SeqIO.parse("{0}".format(out), "fasta")
    for part in grouper(to_split, int(round(len(list_of_genes) / split_by))):
        count = count + 1
        seqs = filter(lambda x: x is not None, part)  # Remove fill values.
        if glob("{0}.part{1}.faa".format(
                out, count)):  # Remove previous versions of file if present.
            os.remove("{0}.part{1}.faa".format(out, count))
        SeqIO.write(seqs, "{0}.part{1}.faa".format(out, count), "fasta")
        query_cmds.append([
            "blastp", "-query", "{0}.part{1}.faa".format(out, count), "-db",
            "{0}.db".format(out), "-outfmt", "6 std qlen slen", "-evalue",
            "0.0001", "-out", "{0}.part{1}.subblast".format(out, count),
            "-num_threads", "1"
        ])
    subblastlog.write(
        "Split original query file {0} ({1} sequences) into {2} files.\n".
        format(out, len(list_of_genes), str(split_by)))

    ##### Run BLASTp processes simultaneously using mp.Pool. #####
    farm = mp.Pool(processes=split_by)
    farm.map(subprocess_BLAST, query_cmds)
    farm.close()
    subblastlog.write(
        "Finished BLAST+ searches for {0} ({1} sequences), split into {2} files.\n"
        .format(out, len(list_of_genes), str(split_by)))

    ##### Concatenate parallel_BLAST results together in the shell and remove other files. #####
    sp.call(["cat"] + glob("*subblast"),
            stdout=open("{0}.results".format(out), "wb"))
    for bin_file in glob("%s.db*" % out):
        os.remove(bin_file)
    for part_file in glob("%s.part*" % out):
        os.remove(part_file)
    os.remove("%s" % out)
    subblastlog.write(
        "Concatenated BLAST+ output for {0} ({1} sequences).\n".format(
            out, len(list_of_genes), str(split_by)))

    ##### Parse parallel_BLAST results and return them as a SearchIO.index instance to cluster_clean. #####
    blast = SearchIO.index("{0}.results".format(out),
                           "blast-tab",
                           fields=blast_fields)
    subblastlog.write(
        "Loaded {0} as SearchIO.index for cluster_clean.\n".format(out))
    return blast
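The docstring above sums up the approach: dump the remaining noncore proteins to FASTA, run makeblastdb, split the queries, BLAST the parts in parallel and cat the pieces back together. A hedged call sketch, with all file names hypothetical and the module-level helpers it needs (grouper, subprocess_BLAST, subblastlog, blast_fields) assumed to be defined as in the original script:

from Bio import SeqIO

proteins = SeqIO.index("all_proteins.faa", "fasta")       # hypothetical FASTA database
noncore_ids = [seq_id for seq_id in proteins]             # stand-in for the real noncore subset
hits = parallel_BLAST(noncore_ids, proteins, split_by=4,
                      out="ClusterBLAST_2.fasta")
for query_id in hits:
    print(query_id, len(hits[query_id]))                  # hit count per query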
Example #16
def cluster_clean(panoct_clusters,
                  fasta_handle,
                  split_by=4,
                  min_id_cutoff=30,
                  strain_cutoff=1.0,
                  iterations=1):
    """
    Tidy up non-core clusters found by PanOCT.

    Feeds into parallel_BLAST, which expects you to have BLAST+ installed.
    Also feeds into gap_finder, which doesn't require anything else.
    """
    ##### Load in FASTA database and PanOCT results. #####
    db = SeqIO.index(fasta_handle, "fasta")
    full_blast = SearchIO.index("blast_results.txt", "blast-tab")
    matchtable = reader(open(panoct_clusters), delimiter="\t")

    ##### Initialize empty dictionaries for cluster types. #####
    core = {}
    noncore = {}
    softcore = {}

    ##### Initialize variables for total/starting number of genomes. #####
    total = 0
    start = 0

    ##### Populate core & noncore dictionaries by reading PanOCT results. #####
    for row in matchtable:
        if "----------" in row:
            noncore[row[0]] = row[1:]  # Populating our initial noncore dict.
        else:
            core[row[0]] = row[1:]  # Populating our core dict.
            if total == 0:
                total = len(row) - 1  # Total number of genomes.
                print(total)
                start = total - 1  # Max noncore cluster size.

    mainlogfile.write(
        "{0} core clusters and {1} noncore clusters identified...\n".format(
            len(core), len(noncore)))

    #### Run parallel_BLAST and gap finding for n iterations. #####
    for iteration in range(0, iterations, 1):
        mainlogfile.write("Running iteration {0}...\n".format(iteration + 1))
        ##### Loop through noncore clusters from size (total -1) to 2. #####
        for size in range(start, 0, -1):
            filled_count = 0
            merged_count = 0

            ##### Get list of (remaining) noncore protein IDs. #####
            to_blast = filter(lambda x: x != "----------",
                              flatten([noncore[key] for key in noncore]))
            mainlogfile.write("All-vs.-all BLAST of {0} proteins...\n".format(
                len(to_blast)))

            ##### Run parallel_BLAST. #####
            results = parallel_BLAST(
                to_blast, db, split_by,
                "ClusterBLAST_{0}.fasta".format(str(size)))
            outfast = open("ClusterBLAST_{0}.fasta".format(str(size)), "w")
            for seq in to_blast:
                outfast.write(">{0}\n{1}\n".format(db[seq].id, db[seq].seq))
            #results = SearchIO.index("ClusterBLAST_{0}.fasta.results".format(str(size)), "blast-tab",
            #                         fields=blast_fields)
            mainlogfile.write(
                "Finding potential homology gaps in clusters of size {0}...\n".
                format(str(size)))

            ##### Run gap_finder. #####
            gaps = gap_finder(results, db, noncore, total, size, min_id_cutoff,
                              strain_cutoff)

            ##### Identify clusters that need to be merged and move merged clusters to appropriate dictionary. #####
            for cluster in gaps:
                if cluster in noncore:
                    cluster_strains = [
                        i.split("|")[0] for i in filter(
                            lambda x: x != "----------", noncore[cluster])
                    ]
                    for candidate in gaps[cluster]:
                        if candidate in noncore:
                            candidate_strains = [
                                i.split("|")[0]
                                for i in filter(lambda x: x != "----------",
                                                noncore[candidate])
                            ]
                            if len(
                                    set(candidate_strains)
                                    & set(cluster_strains)) == 0:
                                merge_size = (
                                    len(
                                        filter(lambda x: x != "----------",
                                               noncore[cluster])) +
                                    len(
                                        filter(lambda x: x != "----------",
                                               noncore[candidate])))
                                if merge_size == total:
                                    mainlogfile.write(
                                        "{0} (size: {1}) has a homologous cluster: {2} (size: {3})\n"
                                        .format(
                                            cluster,
                                            len(
                                                filter(
                                                    lambda x: x !=
                                                    "----------",
                                                    noncore[cluster])),
                                            candidate,
                                            len(
                                                filter(
                                                    lambda x: x !=
                                                    "----------",
                                                    noncore[candidate]))))
                                    mainlogfile.write(
                                        "Merging smaller cluster {0} into larger cluster {1}...\n"
                                        .format(candidate, cluster))
                                    mainlogfile.write(
                                        "Merged cluster {0} has size {1}.\n".
                                        format(cluster, merge_size))
                                    filled = merge_clusters(
                                        noncore[cluster], noncore[candidate])
                                    softcore[cluster] = filled
                                    del noncore[cluster], noncore[candidate]
                                    filled_count = filled_count + 2
                                elif merge_size < total:
                                    mainlogfile.write(
                                        "{0} (size: {1}) has a homologous cluster: {2} (size: {3})\n"
                                        .format(
                                            cluster,
                                            len(
                                                filter(
                                                    lambda x: x !=
                                                    "----------",
                                                    noncore[cluster])),
                                            candidate,
                                            len(
                                                filter(
                                                    lambda x: x !=
                                                    "----------",
                                                    noncore[candidate]))))
                                    mainlogfile.write(
                                        "Merging smaller cluster {0} into larger cluster {1}...\n"
                                        .format(candidate, cluster))
                                    mainlogfile.write(
                                        "Merged cluster {0} has size {1}.\n".
                                        format(cluster, merge_size))
                                    merged = merge_clusters(
                                        noncore[cluster], noncore[candidate])
                                    noncore[cluster] = merged
                                    del noncore[candidate]
                                    merged_count = merged_count + 2

            mainlogfile.write(
                "At cluster size (n = {0}): merged {1} homologous clusters into {2} softcore clusters.\n"
                .format(size, filled_count, filled_count / 2))
            mainlogfile.write(
                "At cluster size (n = {0}): merged {1} homologous clusters into {2} noncore clusters.\n"
                .format(size, merged_count, merged_count / 2))

            if not os.path.isdir("{0}/sub_BLASTs".format(os.getcwd())):
                os.makedirs("{0}/sub_BLASTs/faa".format(os.getcwd()))
                os.makedirs("{0}/sub_BLASTs/results".format(os.getcwd()))
            for sub_faa in glob("ClusterBLAST_*.fasta"):
                os.rename(
                    sub_faa,
                    "{0}/sub_BLASTs/faa/{1}".format(os.getcwd(), sub_faa))
            for sub_results in glob("*.results"):
                os.rename(
                    sub_results, "{0}/sub_BLASTs/results/{1}".format(
                        os.getcwd(), sub_results))

    with open("new_matchtable.txt", "w") as outmatch:
        for cluster in core:
            outmatch.write("{0}\t{1}\n".format(cluster,
                                               "\t".join(core[cluster])))
        for cluster in softcore:
            outmatch.write("{0}\t{1}\n".format(cluster,
                                               "\t".join(softcore[cluster])))
        for cluster in noncore:
            outmatch.write("{0}\t{1}\n".format(cluster,
                                               "\t".join(noncore[cluster])))

    with open("new_softtable.txt", "w") as outsofmatch:
        for cluster in softcore:
            outsofmatch.write("{0}\t{1}\n".format(cluster, "\t".join(
                softcore[cluster])))

    with open("new_nontable.txt", "w") as outnonmatch:
        for cluster in noncore:
            outnonmatch.write("{0}\t{1}\n".format(cluster,
                                                  "\t".join(noncore[cluster])))

    with open("softcore_pam.txt", "w") as outsof:
        for cluster in softcore:
            pa = []
            for el in softcore[cluster]:
                if el == "----------":
                    pa.append("0")
                else:
                    pa.append("1")
            outsof.write("{0}\n".format("\t".join(pa)))

    with open("noncore_pam.txt", "w") as outnon:
        for cluster in noncore:
            pa = []
            for el in noncore[cluster]:
                if el == "----------":
                    pa.append("0")
                else:
                    pa.append("1")
            outnon.write("{0}\n".format("\t".join(pa)))

    sizes_arg = []
    counts_arg = []
    n_sizes = Counter([
        len(filter(lambda x: x != "----------", noncore[cluster]))
        for cluster in noncore
    ])

    for n_size in n_sizes:
        if int(n_size) < 10:
            sizes_arg.append("n0" + str(n_size))
        else:
            sizes_arg.append("n" + str(n_size))
        counts_arg.append(str(n_sizes[n_size] * int(n_size)))

    core_count = len(flatten(core.values())) + len(
        filter(lambda x: x != "----------", flatten(softcore.values())))
    sizes_arg.append("n" + str(total))
    counts_arg.append(str(core_count))

    core_proteome = len(flatten(core.values()))
    softcore_proteome = len(
        filter(lambda x: x != "----------", flatten(softcore.values())))
    noncore_proteome = len(
        filter(lambda x: x != "----------", flatten(noncore.values())))

    mainlogfile.write("====Core: {0} clusters, {1} proteins."
                      "Softcore: {2} clusters, {3} proteins."
                      "Accessory: {4} clusters, {5} proteins.\n====".format(
                          core.keys(), core_proteome, softcore.keys(),
                          softcore_proteome, noncore.keys(), noncore_proteome))

    ring_plot = [
        "Rscript", "{0}/PlotRingChart.R".format(dirname),
        str(core_proteome),
        str(softcore_proteome),
        str(noncore_proteome), ",".join(size for size in sizes_arg),
        ",".join(count for count in counts_arg)
    ]
    try:
        sp.check_call(ring_plot)
        mainlogfile.write("Creating ring chart in R...\n")
    except sp.CalledProcessError as r_exec:
        if r_exec.returncode != 0:
            mainlogfile.write(
                "Unable to run R script PlotRingChart.R, attempted command below:\n"
            )
            mainlogfile.write(" ".join(ring_plot) + "\n")

    upset_plot = [
        "Rscript", "PlotUsingUpSet.R", "softcore_pam.txt", "softcore_upset.eps"
    ]
    try:
        sp.check_call(upset_plot)
        mainlogfile.write("Creating upset plot of softcore clusters in R...\n")
        upset_plot = [
            "Rscript", "PlotUsingUpSet.R", "noncore_pam.txt",
            "noncore_upset.eps"
        ]
        try:
            sp.check_call(upset_plot)
        except sp.CalledProcessError as r_exec:
            if r_exec.returncode != 0:
                mainlogfile.write(
                    "Unable to run R script PlotUsingUpSet.R. Run command manually:\n"
                )
                mainlogfile.write(" ".join(upset_plot) + "\n")
    except sp.CalledProcessError as r_exec:
        if r_exec.returncode != 0:
            mainlogfile.write(
                "Unable to run R script PlotUsingUpSet.R. Run command manually:\n"
            )
            mainlogfile.write(" ".join(upset_plot) + "\n")

    mainlogfile.write(
        "Remaining noncore clusters after prediction analysis: {0}\n".format(
            len(noncore)))
    mainlogfile.write(
        "prediction analysis finished in {0} seconds. Thank you for choosing prediction, the friendly pangenome software.\n"
        .format(time.time() - start_time))
    mainlogfile.write("=== Finished prediction job at {0}. ===\n".format(
        str(datetime.datetime.now())))
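A hedged call sketch for cluster_clean; the PanOCT matchtable and FASTA file names are placeholders, and blast_results.txt plus the module-level helpers it uses (parallel_BLAST, gap_finder, merge_clusters, flatten, mainlogfile) are assumed to exist as in the original script.

cluster_clean("matchtable.txt", "all_proteins.faa",
              split_by=4, min_id_cutoff=30,
              strain_cutoff=1.0, iterations=1)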
Example #17
from Bio import SearchIO
result_handle = open("results/blastp_resultAD.txt")
idx = SearchIO.index('results/blastp_resultAD.txt', 'blast-tab')# comments=True)
idss = (idx.keys())
homs = {}
rec_list_all = []

for ids in idss:
    rec_list = []
    for rec in idx[ids].hsps:
        rec_list.append((rec.hit_id, rec.evalue, rec.ident_pct))
        rec_list_all.append((rec.query_id, rec.hit_id, rec.evalue, rec.ident_pct))
    homs[ids] = {ids: rec_list}
leg_uniq = []
ids = []
for p in sorted(rec_list_all, key=lambda x: x[0][1]):
    if p[2] < 10E-3 and p[1] not in p[0]:
        if p[1] not in ids:
            ids.append(p[1])
            leg_uniq.append(p)

output = open('results/leg_blastp_hits.txt', 'w')
output.write('id'+'\t'+'blastp_hit'+ '\t' + 'e-value' + '\t' + 'ident_pct' + '\n')
for p in leg_uniq:
    qid = '%s' % p[0]
    hit = '%s' % p[1]
    ev = '%s' % p[2]
    per = '%s' % p[3]
    output.write(qid + "\t" + hit + "\t" + ev + '\t' + per + "\n")
output.close()
Example #18
def FillGaps(blast, matchtable, seqs, tags):
    """
    Try to fill in gaps in syntenic clusters that might have arisen via genomic events and/or assembly artefacts.
    """
    # Load core and accessory cluster sets, BLAST+ data and sequence data.
    core, acc = ParseMatchtable(matchtable)
    new_clusters = {}
    if acc:
        searches = SearchIO.index(blast, "blast-tab")
        tags = [line.strip("\n") for line in open(tags)]

        # Loop over every accessory cluster.
        og_acc = acc.keys()
        ignore = []
        for q_cluster_id in og_acc:
            print "{0} out of {1} clusters searched".format(
                og_acc.index(q_cluster_id), len(og_acc))
            if q_cluster_id not in ignore:
                current_acc = [key for key in acc.keys() if key not in ignore]
                if q_cluster_id in current_acc:
                    q_cluster = acc[q_cluster_id]
                    q_pos = [pos for pos, gene in enumerate(q_cluster) if gene]
                    q_present = set([tags[pos] for pos in q_pos])
                    q_members = set(
                        sorted(filter(lambda x: x is not None, q_cluster)))
                    q_missing = set(
                        filter(lambda tag: tag not in q_present, tags))
                    q_blasts = QueryClusterFirstHits(q_cluster, searches, 30,
                                                     q_missing)
                    q_first_hits = set(
                        filter(lambda x: x is not None,
                               Flatten(q_blasts.values())))
                    q_query = MultipleInsert(list(q_first_hits), tags)
                    if q_query in acc.values():
                        s_cluster_id = acc.keys()[acc.values().index(q_query)]
                        if s_cluster_id not in ignore:
                            s_cluster = acc[s_cluster_id]
                            s_members = set(
                                sorted(
                                    filter(lambda x: x is not None,
                                           s_cluster)))
                            if s_members == q_first_hits:
                                s_present = set(
                                    [gene.split("|")[0] for gene in s_members])
                                s_missing = set(
                                    filter(lambda tag: tag not in s_present,
                                           tags))
                                s_blasts = QueryClusterFirstHits(
                                    s_cluster, searches, 30, s_missing)
                                s_first_hits = set(
                                    filter(lambda x: x is not None,
                                           Flatten(s_blasts.values())))
                                reciprocal = Reciprocal(
                                    q_members, q_first_hits, s_members,
                                    s_first_hits)
                                if reciprocal:
                                    new_cluster = ClusterMerge(
                                        q_cluster, s_cluster)
                                    new_clusters[q_cluster_id] = new_cluster
                                    acc.pop(q_cluster_id, "None")
                                    acc.pop(s_cluster_id, "None")
                                    print "clusters merged: {0} {1}\n".format(
                                        str(q_cluster_id), str(s_cluster_id))
                                    print "size of clusters merged: {0} {1}\n".format(
                                        len(q_members), len(s_members))
                                    ignore = ignore + [
                                        q_cluster_id, s_cluster_id
                                    ]
            else:
                pass
    else:
        pass

    # Write new matchtable to file.
    with open("refined_matchtable.txt", "w") as out:
        if acc:
            dataset = [core, acc, new_clusters]
        else:
            dataset = [core]
        for comp in dataset:
            for cluster in comp:
                line = [
                    "----------" if not a else str(a) for a in comp[cluster]
                ]
                out.write("\t".join(line) + "\n")