def check_phage_functions(sample, blastfile, outputfile, clusterdb):
    """
    Count how many proteins have hypothetical functions

    Every line of `blastfile` is split on tabs and its second column is
    looked up with `proteinid_to_function`; `is_hypothetical` decides which
    counter the hit goes into. A single tab-separated summary line is written:
    sample, a label, a legend, the two counts and the hypothetical fraction.

    If `clusterdb` does not exist the output file is still created (empty)
    and nothing else is done.

    :param sample: the sample name, written as the first output column
    :param blastfile: tab-separated hits file; column two is the protein id
    :param outputfile: the file to write the summary line to
    :param clusterdb: path to the cluster database
    :return: nothing
    """
    # Opening in 'w' mode first keeps the original contract that the output
    # file always exists afterwards; the context manager closes it on every
    # exit path, including exceptions raised by the lookups.
    with open(outputfile, 'w') as out:
        if not os.path.exists(clusterdb):
            return

        phage_cluster_db = connect_to_db(clusterdb)
        hypo = 0
        nonhypo = 0
        with open(blastfile, 'r') as f:
            for l in f:
                p = l.strip().split("\t")
                if len(p) < 2:
                    # blank or malformed line: there is no protein id to look up
                    continue
                fn = proteinid_to_function(p[1], phage_cluster_db)
                if is_hypothetical(fn):
                    hypo += 1
                else:
                    nonhypo += 1

        # An empty hits file used to raise ZeroDivisionError after half the
        # line had been written; report a fraction of 0.0 instead.
        total = hypo + nonhypo
        fraction = hypo / total if total else 0.0

        out.write(f"{sample}\tHypothetical proteins\t")
        out.write("[Hypothetical, Non-hypothetical, Fraction hypothetical]\t")
        out.write(f"{hypo}\t{nonhypo}\t{fraction}\n")
def print_all_proteins():
    """
    Print all the proteins to stdout

    Connects to the database named by the module-level `phagedb` and hands
    the connection to `protein_to_fasta`, which produces the output.

    :return: nothing
    """
    con = connect_to_db(phagedb)
    try:
        protein_to_fasta(con)
    finally:
        # release the connection even if writing the proteins fails
        disconnect(con)
def list_all_genomes():
    """
    Print the description of every genome to stdout, one per line

    Reads the `description` column of the `genome` table in the database
    named by the module-level `phagedb`.

    :return: nothing
    """
    con = connect_to_db(phagedb)
    try:
        exc = con.cursor().execute("select description from genome")
        for d in exc.fetchall():
            print(f"{d[0]}")
    finally:
        # release the connection even if the query or printing fails
        disconnect(con)
def enrich_a_cluster(clid, mems, phagedb, exout=None, verbose=False):
    """
    Summarise the proteins that belong to one cluster

    The members are looked up in the `protein` table in batches and the
    shortest and longest proteins, the product (function) counts and the mean
    length are collected.

    :param clid: cluster id (used in the progress message and the extended output)
    :param mems: the members of the cluster (protein accessions)
    :param phagedb: the phage genome sqlite file
    :param exout: extended output file. If given, one tab-separated line of
        cluster id, accession, length and product is appended per protein
    :param verbose: more output (passed to connect_to_db)
    :return: a tuple of (shortest accession, shortest length, longest accession,
        longest length, dict of product -> count, mean length). If no protein
        is found the result is (None, 0, None, 0, {}, 0).
    """
    # Accept any iterable of members; the batching below needs slicing.
    mems = list(mems)

    # Batch size for the IN (...) query -- presumably chosen to stay below
    # SQLite's limit on bound parameters (TODO confirm).
    maxn = 500

    lens = []
    shortest = [None, 0]
    longest = [None, 0]
    functions = {}

    conn = connect_to_db(phagedb, verbose)
    eout = None
    try:
        cur = conn.cursor()
        if exout:
            eout = open(exout, 'a')

        # range() stops before len(mems), so no empty trailing batch is
        # queried when the member count is an exact multiple of maxn.
        for c in range(0, len(mems), maxn):
            e = c + maxn
            if c > 0:
                sys.stderr.write(
                    f"{color.PINK}Retrieving clusters {c}:{e} for {clid}{color.ENDC}\n"
                )
            tm = mems[c:e]
            placeholders = ','.join(['?'] * len(tm))
            protein_query = f"select accession, length, product from protein where accession in ({placeholders})"
            cur.execute(protein_query, tm)
            for row in cur.fetchall():
                if eout:
                    r = "\t".join(map(str, row))
                    eout.write(f'{clid}\t{r}\n')
                lens.append(row[1])
                # The first row seeds both extremes, so the answer no longer
                # depends on arbitrary sentinel lengths.
                if longest[0] is None or row[1] > longest[1]:
                    longest = [row[0], row[1]]
                if shortest[0] is None or row[1] < shortest[1]:
                    shortest = [row[0], row[1]]
                functions[row[2]] = functions.get(row[2], 0) + 1
    finally:
        if eout:
            eout.close()
        # assumes connect_to_db returns a DB-API connection (it is used via
        # .cursor() above); this function only reads, so nothing to commit
        conn.close()

    # An empty cluster (or one with no matching proteins) has no mean length.
    mean_length = sum(lens) / len(lens) if lens else 0
    return shortest[0], shortest[1], longest[0], longest[1], functions, mean_length
def lookup_word(word):
    """
    Return the number of proteins with the word `word` in their product field

    Runs a MATCH query against the `protein_fts` table of the database named
    by the module-level `phagedb`.

    :param word: the word to search for
    :return: int the number of occurrences of word
    """
    con = connect_to_db(phagedb)
    try:
        sql = "select count(1) from protein_fts where product match ?"
        return con.cursor().execute(sql, [word]).fetchone()[0]
    finally:
        # assumes connect_to_db returns a DB-API connection (it is used via
        # .cursor() above); a new one is opened per call, so release it here
        con.close()
import os import sys import argparse from pppf_accessories import color from pppf_databases import connect_to_db, disconnect if __name__ == '__main__': parser = argparse.ArgumentParser( description="Sequences in the phage database not in a cluster") parser.add_argument('-p', help='phage database', required=True) parser.add_argument('-c', help='cluster database', required=True) parser.add_argument('-v', help='verbose output', action='store_true') args = parser.parse_args() pbc = connect_to_db(args.p, args.v) pcur = pbc.cursor() dbc = connect_to_db(args.c, args.v) ccur = dbc.cursor() cl = set() ex = ccur.execute("select protein_md5sum, cluster from md5cluster") for (m, c) in ex.fetchall(): cl.add(m) if args.v: sys.stderr.write( f"{color.GREEN}Loaded {len(cl)} proteins{color.ENDC}\n") ex = pcur.execute(
# Command line options
parser = argparse.ArgumentParser(description='Find new accesssions')
parser.add_argument('-f', help='input file of [gi, accession number]', required=True)
parser.add_argument('-p', help='phage database', required=True)
parser.add_argument('-o', help='file to write needed IDs to', required=True)
parser.add_argument('-v', help='verbose output', action='store_true')
args = parser.parse_args()

# note that identifier has the version.
# Thus identifier = AF068845.1 and accession = AF068845
# We probably want identifier!
con = connect_to_db(args.p, args.v)
exc = con.cursor().execute("select identifier, accession from genome")

# Build both directions of the identifier <-> accession mapping.
ids = {}
accs = {}
for identifier, accession in exc.fetchall():
    ids[identifier] = accession
    accs[accession] = identifier

# The two maps only have the same size when the mapping is one-to-one.
# This is checked explicitly rather than with `assert`, because assert
# statements are removed when Python runs with -O and the check would then
# silently disappear.
if len(ids) != len(accs):
    sys.stderr.write(
        f"{color.RED}FATAL: We found {len(ids)} identifiers and {len(accs)} accessions{color.ENDC}\n"
    )
    sys.exit(1)
field lengths to define the database! """ import os import sys import argparse import pppf_db from pppf_databases import connect_to_db, disconnect __author__ = 'Rob Edwards' __copyright__ = 'Copyright 2020, Rob Edwards' __credits__ = ['Rob Edwards'] __license__ = 'MIT' __maintainer__ = 'Rob Edwards' __email__ = '*****@*****.**' for db in pppf_db.phagedb, pppf_db.clustersdb: print(f"Fields in {db}") con = connect_to_db(db) cursor = con.cursor() exc = cursor.execute("SELECT name FROM sqlite_master WHERE type='table';") for tbltpl in exc.fetchall(): tbl = tbltpl[0] print(f"\tTable: {tbl}") cd = con.execute(f"select * from {tbl} limit 1") names = list(map(lambda x: x[0], cd.description)) for fld in names: lfsql = f"select length({fld}) from {tbl} order by length({fld}) DESC limit 1;" lfexc = cursor.execute(lfsql) print(f"{tbl} :: {fld} :: {lfexc.fetchone()[0]}")
Convert a protein ID to a dict object of all function :param proteinid: The protein md5 sum :param clusterdb_cursor: the cursor to the cluster database :param verbose: more output :return: dict: the functions of the protein and their frequency """ global protein_functions if proteinid not in protein_functions: protein_functions[proteinid] = get_functions(proteinid, clusterdb_cursor, verbose) return json.loads(protein_functions[proteinid][1]) if __name__ == '__main__': parser = argparse.ArgumentParser(description=" ") parser.add_argument('-i', help='protein id', required=True) parser.add_argument('-c', help='cluster database', required=True) parser.add_argument('-v', help='verbose output', action='store_true') args = parser.parse_args() c= connect_to_db(args.c, args.v) fn = proteinid_to_function(args.i, c.cursor(), args.v) fns = proteinid_to_all_functions(args.i, c.cursor(), args.v) fnstr = "\n".join([f"{x} -> {str(y)}" for x,y in sorted(fns.items(), key=lambda item: item[1], reverse=True)]) disconnect(c, args.v) print(f"The function of {args.i} is\n'{fn}'") print(f'All the functions are:\n{fnstr}')
if __name__ == '__main__':
    # Either database, both, or neither may be requested on the command line;
    # tables are only defined for the ones that were given.
    cli = argparse.ArgumentParser(
        description="Create a database and load it with GenBank data")
    cli.add_argument('-p', help='Phage SQL output database')
    cli.add_argument('-c', help='clusters SQLite database')
    cli.add_argument('-v', help='verbose output', action='store_true')
    args = cli.parse_args()

    if args.p:
        sys.stderr.write(
            f"{color.BOLD}{color.BLUE}Defining Phage Tables{color.ENDC}\n")
        if not os.path.exists(args.p):
            # create an empty file before connecting
            open(args.p, 'w').close()
        phage_connection = connect_to_db(args.p, args.v)
        define_phage_tables(phage_connection, args.v)
        # final commit to make sure everything saved!
        phage_connection.commit()
        disconnect(phage_connection, args.v)

    if args.c:
        sys.stderr.write(
            f"{color.BOLD}{color.BLUE}Defining Cluster Tables{color.ENDC}\n")
        if not os.path.exists(args.c):
            # create an empty file before connecting
            open(args.c, 'w').close()
        cluster_connection = connect_to_db(args.c, args.v)
        define_cluster_tables(cluster_connection, args.v)
        cluster_connection.commit()
        disconnect(cluster_connection, args.v)
from pppf_accessories import color __author__ = 'Rob Edwards' __copyright__ = 'Copyright 2020, Rob Edwards' __credits__ = ['Rob Edwards'] __license__ = 'MIT' __maintainer__ = 'Rob Edwards' __email__ = '*****@*****.**' if __name__ == '__main__': parser = argparse.ArgumentParser(description=" ") parser.add_argument('-d', help='phage database', required=True) parser.add_argument('-v', help='verbose output', action='store_true') args = parser.parse_args() dbcon = connect_to_db(args.d, args.v) cur = dbcon.cursor() gene_query = "select gene_rowid, contig, start, end, protein from gene" exc = cur.execute(gene_query) if args.v: sys.stderr.write( f"{colour.GREEN}Reading gene locations{colour.ENDC}\n") firstgene = {} for tple in exc.fetchall(): contig = tple[1] if contig not in firstgene: firstgene[contig] = tple l = min(tple[2], tple[3]) if l < firstgene[contig][2] or l < firstgene[contig][2]:
""" """ import os import sys import argparse from pppf_databases import connect_to_db, disconnect from pppf_clusters import read_mmseqs_clusters, add_functions_to_clusters, insert_cluster_metadata, insert_into_database if __name__ == '__main__': parser = argparse.ArgumentParser(description='Load the cluster information into the databases') parser.add_argument('-p', '--phage', help='Phage SQL database', required=True) parser.add_argument('-c', '--clusters', help='Clusters SQL database', required=True) parser.add_argument('-t', '--tsv', help='Cluster tsv file', required=True) parser.add_argument('-n', '--name', help='Cluster name (short text)', required=True) parser.add_argument('-d', '--description', help='Cluster description (human readable text)', required=True) parser.add_argument('-c', '--cli', help='Cluster command line (bash)', required=True) parser.add_argument('-v', '--verbose', help='verbose output', action='store_true') args = parser.parse_args() phageconn = connect_to_db(args.phage, args.verbose) clconn = connect_to_db(args.clusters, args.verbose) clusters = read_mmseqs_clusters(args.tsv, args.verbose) (clusters, protein_info) = add_functions_to_clusters(clusters, phageconn, args.verbose) metadata_id = insert_cluster_metadata(clconn, args.name, args.description, args.cli, args.verbose) insert_into_database(clusters, clconn, phageconn, metadata_id, protein_info, args.verbose) disconnect(phageconn, args.verbose) disconnect(clconn, args.verbose)