print(line) # Filter self-matches, add to the database if line[0] != line[1]: resultsDB.addBLASTresult(line[0], line[1], line[2], line[3]) """ # Create the LSH tables in the results database resultsDB.createLSHtable("lshresults3") resultsDB.deleteTable("lshresults3") resultsDB.createLSHtable("lshresults3") for query in minhash3.minhashes.keys(): print(query) matches = minhash3.queryProtein(query) for match in matches: # Filter self-matches if query != match: jaccard = minhash3.estimateJaccard(query, match) resultsDB.addLSHresult(query, match, jaccard, "lshresults3") # Create the LSH tables in the results database resultsDB.createLSHtable("lshresults4") resultsDB.deleteTable("lshresults4") resultsDB.createLSHtable("lshresults4") for query in minhash4.minhashes.keys(): matches = minhash4.queryProtein(query) for match in matches: # Filter self-matches if query != match: jaccard = minhash4.estimateJaccard(query, match) resultsDB.addLSHresult(query, match, jaccard, "lshresults4") # Create the LSH tables in the results database
def run(self):
    """Run the interactive console menu for the LSH protein similarity search.

    Prints the list of options, then repeatedly reads an option from standard
    input and executes it until the user enters ``Exit`` or ``X``. Every
    option can be given by its full name or by its abbreviation. Besides the
    options shown in the menu, ``Calculate All`` / ``CA`` is also accepted.

    Unrecognised input is ignored and the prompt is shown again.
    """
    print(
        "Local Sensitivity Hashing-based protein similarity search.\n"
        "Options: E[X]it, [L]oad Database, [D]elete Database, "
        "[C]alculate LSH, [RC] Recalculate LSH,\n"
        "[LL] Load LSH, [S]ave LSH\n"
        "[Q]uery LSH, Query [A]ll LSH, Read [B]LAST, Compare [R]esults,\n"
    )
    mode = input('Choose option:')
    uniDB = UniprotDB("Uniprot_DB.sqlite")
    # Initial index: Jaccard threshold 0.5 and 96 permutations, the same
    # values the 'Recalculate LSH' prompts advertise as defaults.
    minhash = LSH(0.5, 96)

    # The options are mutually exclusive and `mode` is only re-read at the
    # bottom of the loop, so a single if/elif chain is sufficient.
    while mode not in ('Exit', 'X'):
        if mode in ('Delete Database', 'D'):
            uniDB.deleteProteins()

        elif mode in ('Load Database', 'L'):
            protManager = ProteinsManager()
            uniDB.createTables()
            filename = input(
                'XML filename (e.g. Ecolx.xml or PseA7.xml or Human.xml): ')
            protManager.loadProteins(filename, uniDB)

        elif mode in ('Calculate LSH', 'C'):
            # A fresh database handle is opened here, as in the original.
            uniDB = UniprotDB("Uniprot_DB.sqlite")
            proteins = uniDB.extractProteins()
            minhash.calculateLSH(proteins, 3)  # shingle size 3
            print("Calculated")

        elif mode in ('Recalculate LSH', 'RC'):
            # Empty input selects the default named in each prompt instead of
            # failing inside float('') / int('').
            jaccardThreshold = float(
                input("Specify a Jaccard similarity threshold (default: 0.5): "
                      ).strip() or 0.5)
            permutations = int(
                input("Specify the number of permutations(default: 96) : "
                      ).strip() or 96)
            shinglesize = int(
                input("Specify the shingle size (default: 3): "
                      ).strip() or 3)
            minhash = LSH(jaccardThreshold, permutations)
            proteins = uniDB.extractProteins()
            minhash.calculateLSH(proteins, shinglesize)
            print("Recalculated")

        elif mode in ('Query LSH', 'Q'):
            protein = input('Protein accession: ')
            start_time = time.time()
            result = minhash.queryProtein(protein)
            if result is not None:
                jaccResultsDict = minhash.checkJaccardResultsOfProtein(
                    protein, result)
                # Report the matches from the largest to the smallest
                # Jaccard score.
                ranked = sorted(jaccResultsDict.items(), key=lambda x: -x[1])
                for accession, jaccard in ranked:
                    print("\nMatch with Jaccard:", jaccard)
                    information = uniDB.extractProteinInformation(accession)
                    proteininfo = uniProtein(*information)
                    proteininfo.printUniProtein(printSeq=False)
            print("Runtime of query search: %s seconds "
                  % (time.time() - start_time))

        elif mode in ('Calculate All', 'CA'):
            # Rebuild the index with shingle size 3, then query every protein
            # against it and time the whole run.
            start_time = time.time()
            uniDB = UniprotDB("Uniprot_DB.sqlite")
            proteins = uniDB.extractProteins()
            minhash.calculateLSH(proteins, 3)
            for protein in proteins:
                print("Protein ", protein[0])
                result = minhash.queryProtein(protein[0])
                if result is not None:
                    jaccResultsDict = minhash.checkJaccardResultsOfProtein(
                        protein[0], result)
                    ranked = sorted(jaccResultsDict.items(),
                                    key=lambda x: -x[1])
                    for accession, jaccard in ranked:
                        print(accession, " - Jaccard: ", jaccard)
            print("Runtime of query all: %s seconds "
                  % (time.time() - start_time))

        elif mode in ('Query All LSH', 'A'):
            resultsDB = ResultsDB("Results_DB.sqlite")
            # create -> delete -> create: presumably guarantees an empty
            # table whether or not one already existed (confirm against
            # ResultsDB).
            resultsDB.createLSHtable("lshresults")
            resultsDB.deleteTable("lshresults")
            resultsDB.createLSHtable("lshresults")
            for query in minhash.minhashes.keys():
                matches = minhash.queryProtein(query)
                # The other query branches treat None as "no result"; apply
                # the same guard here instead of failing on iteration.
                if matches is None:
                    continue
                for match in matches:
                    # Filter self-matches
                    if query != match:
                        jaccard = minhash.estimateJaccard(query, match)
                        resultsDB.addLSHresult(query, match, jaccard,
                                               "lshresults")
            print(resultsDB.extractLSHresults("lshresults"))

        elif mode in ('Read BLAST Results', 'B'):
            filename = input('Filename: ')
            # The context manager closes the file even if a line is malformed.
            with open(filename, 'r') as handle:
                resultsDB = ResultsDB("Results_DB.sqlite")
                resultsDB.createBLASTtable()
                resultsDB.deleteBLASTresults()
                resultsDB.createBLASTtable()
                for raw_line in handle:
                    # Blank lines (e.g. a trailing one) carry no record.
                    if not raw_line.strip():
                        continue
                    # Strip only the newline; slicing off the last character
                    # would corrupt a final line that has no newline.
                    fields = raw_line.rstrip('\n').split('\t')
                    # Extract accessions from
                    # 'sp|A0A0R6L508|MCR1_ECOLX'-like strings
                    fields[0] = fields[0].split('|')[1]
                    fields[1] = fields[1].split('|')[1]
                    print(fields)
                    # Filter self-matches, add to the database
                    if fields[0] != fields[1]:
                        resultsDB.addBLASTresult(fields[0], fields[1],
                                                 fields[2], fields[3])
            print(resultsDB.extractBLASTresults())

        elif mode in ('Compare Results', 'R'):
            # Database with all LSH and BLASTp results
            resultsDB = ResultsDB("Results_DB.sqlite")
            identity_th, alignment_th, jaccard_th = 80.0, 100, 0.5
            precisions = []
            recalls = []

            # Load in all protein ids to loop over
            uniDB = UniprotDB("Uniprot_DB.sqlite")
            proteins = uniDB.extractProteins()

            # Store the precision and recall of every query so they can be
            # averaged afterwards.
            for query in proteins:
                intersect = resultsDB.extractIntersectCountPerProtein(
                    query[0], 'lshresults', identity_th, alignment_th,
                    jaccard_th)
                lshresults = resultsDB.extractLSHcountPerProtein(
                    query[0], 'lshresults', jaccard_th)
                blastresults = resultsDB.extractBLASTcountPerProtein(
                    query[0], identity_th, alignment_th)

                tp = intersect
                fp = lshresults - intersect
                fn = blastresults - intersect

                # A ratio with a zero denominator is undefined; such queries
                # are left out of the corresponding average.
                if tp + fp != 0:
                    precisions.append(tp / (tp + fp))
                if tp + fn != 0:
                    recalls.append(tp / (tp + fn))

            # With no usable query the average is undefined: report nan
            # rather than raising ZeroDivisionError.
            avg_precision = (sum(precisions) / len(precisions)
                             if precisions else float('nan'))
            avg_recall = (sum(recalls) / len(recalls)
                          if recalls else float('nan'))
            print("Comparison of BLAST and LSH results:\n Number of proteins queried: %i \n Average precision: %0.3f Average recall: %0.3f\n" \
                  % (len(proteins), avg_precision, avg_recall))

        elif mode in ('Save LSH', 'S'):
            number = int(input('Suffix number: '))
            minhash.saveLSH(number)

        elif mode in ('Load LSH', 'LL'):
            number = int(input('Suffix number: '))
            minhash.loadLSH(number)

        mode = input('Choose option: ')