def predict_probes(sequences, taxonomy, args):
    """Run the probe-design steps in order and save the prioritised probes.

    Steps: split the sequences by clade, collect candidate k-mers from the
    selected clade, count their occurrences in the other sequences, order
    the candidates and hand them to ``save_result``.

    Args:
        sequences: mapping passed to ``split_sequences`` as the sequences.
        taxonomy: mapping passed to ``split_sequences`` as the taxonomy.
        args: object providing ``verbose``, ``sel_clade``, ``probe_len``,
            ``perc_seq`` and ``outfile``.
    """
    # set verbose
    global VERBOSE
    VERBOSE = args.verbose

    def announce(step):
        # Step banners are shown only from verbosity level 3 upwards.
        if VERBOSE > 2:
            UTIL_log.print_log(step)

    # Zero, find sequences that belong to the selected clade
    announce("Identify sequences from the selected clade")
    seq_sel_clade, seq_other = split_sequences(
        taxonomy, args.sel_clade, sequences)

    # First, identify possible conserved regions
    announce("Identify k-mers for the query clade")
    kmers_recall, kmers_precision = find_conserved_regions(
        seq_sel_clade, args.probe_len, args.perc_seq)

    # Second, check if identified regions are unique, compared to the other
    # clades (~ evaluating precision)
    announce("Check if the identified k-mers are present in the other clades")
    # The returned mapping is not used further in this function.
    other_sel_clades = check_uniqueness(
        kmers_precision, seq_other, args.probe_len)

    # Third, prioritize selected probes
    announce("Prioritize selected probes")
    n_selected = len(seq_sel_clade)
    probe_order = priotitize_probes(kmers_recall, kmers_precision, n_selected)

    # print/save to outfile
    announce("Save the result")
    save_result(probe_order, args.outfile, len(seq_sel_clade),
                kmers_recall, kmers_precision)
def save_file(lines, outfile):
    """Write ``lines`` to ``outfile``, or to stdout when ``outfile`` is None.

    For a real output file the lines are first written to a temporary file
    (mode 0o644), flushed and fsync'ed, and only then moved to ``outfile``,
    so a failure does not leave a half-written result at the destination.
    The temporary file is always removed when it could not be moved.

    Args:
        lines: iterable of strings, written as they are (no newline added).
        outfile: destination path, or None to write to ``sys.stdout``.
    """
    if outfile is None:
        for line in lines:
            sys.stdout.write(line)
        return

    temp_file = None
    moved = False
    try:
        # set temp file
        try:
            temp_file = tempfile.NamedTemporaryFile(delete=False, mode="w")
            os.chmod(temp_file.name, 0o644)
        except Exception as e:
            UTIL_log.print_error("couldn't create the file. Message:",
                                 exit=False)
            UTIL_log.print_error(str(e))
            # Nothing can be written if the error helper returns.
            return

        # write lines ---------------------------------------------------------
        for line in lines:
            temp_file.write(line)

        # close and move to final destination ---------------------------------
        try:
            temp_file.flush()
            os.fsync(temp_file.fileno())
            temp_file.close()
            shutil.move(temp_file.name, outfile)
            moved = True
        except Exception as e:
            UTIL_log.print_error("couldn't save the file:", exit=False)
            UTIL_log.print_error(str(e))
    finally:
        # Do not leave an orphan temporary file (or an open handle) behind
        # when anything above failed, including a SystemExit from the logger.
        if temp_file is not None and not moved:
            try:
                temp_file.close()
            except Exception:
                pass
            if os.path.exists(temp_file.name):
                os.remove(temp_file.name)
def test_probe():
    """Print the help of the ``check_probe`` command to stderr.

    The usage line names the command ``check_probe``, which is the name the
    argument parser and the main menu use.
    """
    write = sys.stderr.write
    colour = UTIL_log.colour

    def option(flag, value_type, description, tail):
        # One help row: coloured flag, value type, description, coloured tail.
        write(colour(flag, "blue"))
        write(value_type)
        write(description)
        write(colour(tail, "magenta"))

    # HEADER -------------------------------------------------------------------
    write("\n")
    write(colour("Usage: ", "cyan"))
    write("fish_probes check_probe ")
    write(colour("-i ", "blue"))
    write("<probe_seq> ")
    write(" [option]\n\n")

    # PARAMETERS ---------------------------------------------------------------
    option(" -i ", "STR ", "probe to test", "\n")
    option(" -sp ", " ", "Evaluate also the two half of the probes alone", "\n")
    option(" -v ", "INT ",
           "verbose level: 1=error, 2=warning, 3=message, 4+=debugging",
           " [3]\n")
def main_message(tool_version):
    """Return the top-level usage text listing the available commands.

    Args:
        tool_version: value shown after "Version: " (converted with ``str``).

    Returns:
        The assembled message as a single string.
    """
    cyan = lambda text: UTIL_log.colour(text, "cyan")
    command = lambda text: UTIL_log.colour(text, "green_bold")

    parts = [
        ''' \00 ''',
        # HEADER ---------------------------------------------------------------
        cyan("Program: "),
        "fish_probes - a tool to design FISH probes for 16S sequences\n",
        cyan("Version: "),
        str(tool_version) + "\n",
        cyan("Usage: "),
        "fish_probes <command> [options]\n\n",
        cyan("Command:\n"),
        # COMMANDS -------------------------------------------------------------
        command(" design "),
        "Identify suitable probes for a given clade\n",
        command(" check_probe "),
        "Check physical properties of a probe\n",
        command(" test "),
        "Test the tool\n",
        # CONCLUDING -----------------------------------------------------------
        "\nType fish_probes <command> to print the help for a specific command",
    ]
    return "".join(parts)
def load_taxonomy(taxonomy_file):
    """Load a tab-separated taxonomy file into a dict.

    Each non-empty line is split on tabs; the first field is used as the key
    and the second field as the value. Further fields are ignored. Blank
    lines are skipped, and a line without a second field is reported through
    ``UTIL_log.print_error`` instead of failing with an ``IndexError``.

    Args:
        taxonomy_file: path of the taxonomy file.

    Returns:
        dict mapping the first column to the second column.
    """
    try:
        handle = open(taxonomy_file, "r")
    except (OSError, TypeError, ValueError):
        UTIL_log.print_error("Cannot load the taxonomy file", 5)
        # NOTE(review): print_error presumably terminates the program; if it
        # returns, surface the original failure rather than continuing.
        raise

    # save result to a dict
    result = dict()
    # load file; the context manager closes it even if parsing fails
    with handle:
        for line_number, line in enumerate(handle, start=1):
            stripped = line.rstrip()
            if not stripped:
                continue
            vals = stripped.split("\t")
            if len(vals) < 2:
                UTIL_log.print_error(
                    "Line " + str(line_number) + " of the taxonomy file "
                    "does not contain a tab-separated taxonomy")
                continue
            result[vals[0]] = vals[1]

    if VERBOSE > 2:
        UTIL_log.print_message("Found taxonomy information for " +
                               str(len(result)) + " sequences.\n")
    return result
def find_conserved_regions(seq_sel_clade, k, perc_seq_with_kmer):
    """Collect the k-mers that are frequent enough in the selected clade.

    Every sequence is decomposed with ``find_kmers`` and the occurrences of
    each k-mer are counted across sequences. A k-mer is kept when its count
    is at least ``len(seq_sel_clade) * perc_seq_with_kmer``. The comparison
    is inclusive, matching the "at least" wording of the log message, so a
    fraction of 1 keeps the k-mers found in every sequence.

    Args:
        seq_sel_clade: dict of sequences of the selected clade, keyed by id.
        k: k-mer length passed to ``find_kmers``.
        perc_seq_with_kmer: minimum fraction of sequences (0-1).

    Returns:
        Tuple ``(kmers_recall, kmers_precision)``: the count of each kept
        k-mer, and a dict with the same keys initialised to 0 (it is filled
        in later by ``check_uniqueness``).
    """
    # count how many times each k-mer appears across the sequences
    count_mers = dict()
    for seq_id in seq_sel_clade:
        # Only the first value returned by find_kmers is used here.
        all_k, _all_k_n = find_kmers(seq_sel_clade[seq_id], k)
        for kmer in all_k:
            count_mers[kmer] = count_mers.get(kmer, 0) + 1

    if VERBOSE > 2:
        UTIL_log.print_message("Identifed "+str(len(count_mers))+" unique "+str(k)+"-mers.")

    # we check which k-mers covers all sequences
    n_seq = len(seq_sel_clade)
    min_count = n_seq * perc_seq_with_kmer
    kmers_recall = dict()
    # this will be filled in by "check_uniqueness"
    kmers_precision = dict()
    n_identical = 0  # used only to print
    for kmer, count in count_mers.items():
        if count == n_seq:
            n_identical += 1
        if count >= min_count:
            kmers_recall[kmer] = count
            kmers_precision[kmer] = 0

    if VERBOSE > 2:
        UTIL_log.print_message(" (Identifed "+str(n_identical)+" "+str(k)+"-mers present in all sequences)")
        UTIL_log.print_message(str(len(kmers_precision))+" "+str(k)+"-mers will go to the next step.")
        UTIL_log.print_message("(only k-mers present in at least "+str(perc_seq_with_kmer*100)+"% of the sequences will be used).\n")

    if not kmers_precision:
        if VERBOSE > 1:
            UTIL_log.print_warning("No k-mers passed the filter. Please decrease the threshold in -p")

    # return
    return kmers_recall, kmers_precision
def check_uniqueness(kmers_precision, seq_other, probe_len):
    """Count how often the candidate k-mers occur in the other sequences.

    ``kmers_precision`` is updated in place: for every k-mer of every
    sequence in ``seq_other`` that is a key of ``kmers_precision`` the
    stored count is increased by one.

    Args:
        kmers_precision: dict of candidate k-mers to counts (mutated).
        seq_other: dict of sequences outside the selected clade, keyed by id.
        probe_len: k-mer length passed to ``find_kmers``.

    Returns:
        dict mapping each matched k-mer to the list of ids of the sequences
        in which it was found.
    """
    hits_by_kmer = dict()
    total_hits = 0

    # we check if the kmers are covered by other sequences
    for seq_id, sequence in seq_other.items():
        plain_kmers, kmers_with_n = find_kmers(sequence, probe_len)
        for candidate in plain_kmers:
            if candidate not in kmers_precision:
                continue
            kmers_precision[candidate] += 1
            hits_by_kmer.setdefault(candidate, list()).append(seq_id)
            total_hits += 1
        # we need to evaluate the ones with an N or others
        for _unhandled in kmers_with_n:
            dummy = "TODO"

    if VERBOSE > 2:
        UTIL_log.print_message("The selected probes map to other "+str(total_hits)+" sequences.\n")
    return hits_by_kmer
def split_sequences(taxonomy, sel_clade, sequences):
    """Partition the sequences into the selected clade(s) and the rest.

    A sequence belongs to the selection when any entry of ``sel_clade`` is
    equal to one of the ";"-separated fields of its taxonomy string.

    Args:
        taxonomy: dict mapping sequence id to a ";"-separated taxonomy.
        sel_clade: iterable of clade names to select.
        sequences: dict mapping sequence id to sequence.

    Returns:
        Tuple ``(seq_sel_clade, seq_other)`` of dicts keyed by sequence id.
    """
    selected = dict()
    others = dict()
    for seq_id, lineage in taxonomy.items():
        ranks = lineage.split(";")
        # we check if any of the selected clades is present
        in_selection = any(clade in ranks for clade in sel_clade)
        # we add it to the correct dict
        target = selected if in_selection else others
        target[seq_id] = sequences[seq_id]

    if VERBOSE > 2:
        UTIL_log.print_message("Sequences belonging to the selected clade: "+str(len(selected))+".")
        UTIL_log.print_message("Sequences belonging to other clades: "+str(len(others))+".\n")
    return selected, others
def check_input(sequences, taxonomy, args):
    """Validate the loaded sequences and taxonomy against each other.

    Checks, in order:
      1. every sequence has a taxonomy entry;
      2. taxonomy entries without a sequence are removed (in place);
      3. at least one of ``args.sel_clade`` appears in the remaining
         taxonomy.

    The clade check runs after the pruning, so a clade that appears only on
    taxonomy lines without a sequence is reported as missing.

    Args:
        sequences: dict keyed by sequence id.
        taxonomy: dict mapping sequence id to a ";"-separated taxonomy;
            entries without a matching sequence are deleted from it.
        args: object providing ``sel_clade`` (iterable of clade names).
    """
    # check that we have a taxonomy annotation for each sequence
    # NOTE: it can be that there are more taxonomy entries than sequences, but
    # not the contrary
    for seq in sequences:
        if seq not in taxonomy:
            UTIL_log.print_error("Sequence '" + seq +
                                 "' does not have a taxonomy")

    # Remove entries from the taxonomy, if there are no corresponding sequences
    to_remove = [seq for seq in taxonomy if seq not in sequences]
    for r in to_remove:
        del taxonomy[r]
    if to_remove and VERBOSE > 2:
        UTIL_log.print_message(
            "Removed " + str(len(to_remove)) +
            " taxonomy line(s) because no sequence was present.\n")

    # check that the clade is in the (pruned) taxonomy
    wanted = list(args.sel_clade)
    found_clade = False
    for lineage in taxonomy.values():
        ranks = lineage.split(";")
        if any(clade in ranks for clade in wanted):
            found_clade = True
            break
    if not found_clade:
        # none of the clades in the list is present
        UTIL_log.print_error("Selected clade(s) not present in the taxonomy")
def load_sequences(sequences_file):
    """Load a fasta file into a dict of header -> sequence.

    The header is the whole line after the leading ">" (trailing whitespace
    removed). Sequence lines are right-stripped and concatenated. If a
    header appears more than once, the last record wins.

    Args:
        sequences_file: path of the fasta file.

    Returns:
        dict mapping each header to its sequence string.
    """
    try:
        handle = open(sequences_file, "r")
    except (OSError, TypeError, ValueError):
        UTIL_log.print_error("Cannot load the fasta file with the sequences")
        # NOTE(review): print_error presumably terminates the program; if it
        # returns, surface the original failure rather than continuing.
        raise

    # save result to a dict
    result = dict()
    # the context manager closes the file even if parsing fails
    with handle:
        # load file assuming is a fasta file
        first_line = handle.readline()
        if not first_line.startswith(">"):
            UTIL_log.print_error("Not a fasta file", 4)
            # Nothing can be parsed without a first header.
            return result
        header = first_line.rstrip()[1:]
        # Pieces are collected and joined once per record, instead of growing
        # a string line by line.
        pieces = []
        # go through each line
        for line in handle:
            if line.startswith(">"):
                result[header] = "".join(pieces)
                header = line.rstrip()[1:]
                pieces = []
            else:
                pieces.append(line.rstrip())
        # we write the last record
        result[header] = "".join(pieces)

    if VERBOSE > 2:
        UTIL_log.print_message("Found " + str(len(result)) + " sequences.\n")
    return result
def main():
    """Parse the command line, run the requested command and exit with 0."""
    # load sys.argv
    args = UTIL_arg_parser.input_parser(tool_version)
    if args.verbose > 2:
        for text in ("Call: ", " ".join(sys.argv)+"\n"):
            UTIL_log.print_message(text)

    command = args.command
    if command == "design":
        # find probes ----------------------------------------------------------
        sequences, taxonomy = C_load_input.load_and_check_input(args)
        C_predict_probes.predict_probes(sequences, taxonomy, args)
    elif command == "check_probe":
        # test a given probe ---------------------------------------------------
        report = UTIL_probe.create_to_print(
            args.input, header=True, split=args.split_probe)
        print(report)
    elif command == "test":
        # test the tool --------------------------------------------------------
        C_test.test()

    sys.exit(0)
def priotitize_probes(kmers_recall, kmers_precision, n_seq_clade):
    """Order the probes, best candidates first.

    A probe is placed in the leading group when its recall count divided by
    ``n_seq_clade`` equals 1 and its precision count is 0. All other probes
    follow, in the iteration order of ``kmers_recall``.

    Args:
        kmers_recall: dict mapping probe to its count in the selected clade.
        kmers_precision: dict mapping probe to its count in other sequences.
        n_seq_clade: number of sequences in the selected clade.

    Returns:
        list of probes: the leading group followed by the remaining probes.
    """
    if VERBOSE > 2:
        UTIL_log.print_message("We order the probes by the number of wrong clades.")

    # find the order
    perfect = list()
    remaining = list()
    for probe, n_hits in list(kmers_recall.items()):
        # best one: cover all sequences, and not in the other clade
        covers_all = n_hits/n_seq_clade == 1
        if covers_all and kmers_precision[probe] == 0:
            perfect.append(probe)
        else:
            remaining.append(probe)

    if VERBOSE > 2:
        UTIL_log.print_message(str(len(perfect))+" probe(s) present in all the selected clade(s) and have no contamination.\n")

    perfect += remaining
    return perfect
def load_and_check_input(args):
    """Load the sequences and the taxonomy, then validate them together.

    Args:
        args: object providing ``verbose``, ``sequences`` (fasta path) and
            ``taxonomy`` (taxonomy path); it is also passed to
            ``check_input``.

    Returns:
        Tuple ``(sequences, taxonomy)`` as returned by the two loaders.
    """
    # set verbose
    global VERBOSE
    VERBOSE = args.verbose

    def log_step(title):
        # Step banners are shown only from verbosity level 3 upwards.
        if VERBOSE > 2:
            UTIL_log.print_log(title)

    # load data from files
    log_step("Load sequences")
    sequences = load_sequences(args.sequences)

    log_step("Load taxonomy")
    taxonomy = load_taxonomy(args.taxonomy)

    # check that the input is correct
    log_step("Check input files")
    check_input(sequences, taxonomy, args)

    return sequences, taxonomy
def input_parser(tool_version):
    """Parse and validate the command line of fish_probes.

    Builds the argparse parser (the built-in ``-h`` is disabled and handled
    here per command), parses ``sys.argv`` and checks the values. Problems
    are reported through ``UTIL_log.print_error``.

    Args:
        tool_version: version shown in the usage text and by ``--version``.

    Returns:
        The ``argparse.Namespace`` with the parsed arguments.
    """
    parser = argparse.ArgumentParser(
        usage=UTIL_print_menus.main_message(tool_version),
        formatter_class=CapitalisedHelpFormatter,
        add_help=False)

    # COMMAND
    parser.add_argument('command', action="store", default=None,
                        help='command',
                        choices=['design', 'check_probe', 'test'])

    # File with the sequences in fasta format
    parser.add_argument('-s', dest='sequences', action="store", default=None,
                        help='File containing the sequences')

    # File with the taxonomy, the taxonomy is separated by ";", and there is a
    # "\t" between name and tax (Example: "gene1\tclade1;clade2;clade3\n")
    parser.add_argument('-t', dest='taxonomy', action="store", default=None,
                        help='File containing the taxonomy')

    # Clade for which we have to find the primers
    parser.add_argument('-c', dest='sel_clade', action="store", default=None,
                        help='Clade selected to design the primers',
                        nargs="+")

    # Verbose level
    parser.add_argument('-v', action='store', type=int, default=3,
                        dest='verbose',
                        help='Verbose level: 1=error,'
                             ' 2=warning, 3=message, 4+=debugging [3]')

    # Length of the probes that we identify
    parser.add_argument('-k', action='store', type=int, default=20,
                        dest='probe_len', help='Probe length [20]')

    # Output file
    parser.add_argument('-o', action='store', default=None, dest='outfile',
                        help='Output file [stdout]')

    # General input file
    parser.add_argument('-i', action='store', default=None, dest='input',
                        help='General input')

    # Min fraction of sequences from the selected clade
    parser.add_argument(
        '-p', action='store', default=0.9, type=float, dest='perc_seq',
        help='Minimum fraction of sequences that should contain the selected probe [0.9]')

    # help
    parser.add_argument('-h', '--help', action='store_true', default=False,
                        dest='help', help='Print help')

    # split the probe in two
    parser.add_argument('-sp', '--split', action='store_true', default=False,
                        dest='split_probe',
                        help='Evaluate also a probe split in two')

    # version
    parser.add_argument('--version', action='version',
                        version='%(prog)s {0} on python {1}'.format(
                            tool_version, sys.version.split()[0]))

    args = parser.parse_args()

    ############################################################################
    # CHECK ARGUMENTS
    # check args verbose
    if args.verbose < 1:
        UTIL_log.print_error("Verbose (-v) needs to be higher than 0")
    # check args perc seq
    if args.perc_seq < 0 or args.perc_seq > 1:
        UTIL_log.print_error("Threshold (-p) should be between 0 and 1")
    # check length of the probe (the message states the real lower bound, 1)
    if args.probe_len < 1:
        UTIL_log.print_error("Probe length (-k) cannot be lower than 1")

    ############################################################################
    # CHECK ARGUMENTS FOR COMMAND DESIGN
    if args.command == "design":
        # print help
        if args.help:
            UTIL_print_menus.design()
            sys.exit(0)
        # there are three mandatory input
        if args.sequences is None:
            UTIL_print_menus.design()
            UTIL_log.print_error("Missing -s.")
        if args.taxonomy is None:
            UTIL_print_menus.design()
            UTIL_log.print_error("Missing -t.")
        if args.sel_clade is None:
            UTIL_print_menus.design()
            UTIL_log.print_error("Missing -c.")

    ############################################################################
    # CHECK ARGUMENTS FOR COMMAND CHECK_PROBE
    if args.command == "check_probe":
        # print help
        if args.help:
            UTIL_print_menus.test_probe()
            sys.exit(0)
        # there is only one mandatory input
        if args.input is None:
            UTIL_print_menus.test_probe()
            UTIL_log.print_error("Missing -i.")

    return args
def test():
    """Run ``fish_probes design`` on the bundled test data and check it.

    The tool is started as a subprocess with its stderr discarded. The run
    passes when the exit status is 0, the first output line starts with
    "probe" and the second one starts with "CTCGATT".

    The command is passed to ``subprocess.Popen`` as an argument list, so a
    ``TEST_DATA_PATH`` containing spaces or backslashes reaches the tool
    unchanged.
    """
    UTIL_log.print_log("Prepare test")
    UTIL_log.print_message("Run test on /fish_probes/test/.")

    # prepare command ----------------------------------------------------------
    command = [
        "fish_probes", "design", "-c", "Firmicutes", "-k", "7", "-v1",
        "-s", os.path.join(TEST_DATA_PATH, "seq.fa"),
        "-t", os.path.join(TEST_DATA_PATH, "tax"),
    ]
    UTIL_log.print_message("Command:")
    UTIL_log.print_message(" ".join(command) + "\n")

    # we run the command -------------------------------------------------------
    UTIL_log.print_log("Run test")
    try:
        from subprocess import DEVNULL
    except ImportError:
        DEVNULL = open(os.devnull, 'wb')

    cmd = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=DEVNULL)

    # save the result
    result = [line.decode('ascii') for line in cmd.stdout]

    # check exit status
    cmd.stdout.close()
    return_code = cmd.wait()
    if return_code:
        UTIL_log.print_error("Tool failed")
    else:
        UTIL_log.print_message("Command completed correctly.\n")

    # we check the result ------------------------------------------------------
    UTIL_log.print_log("Check result")
    # both a header and a first probe line are needed for the checks below
    if len(result) < 2:
        UTIL_log.print_error(
            "Output incomplete: expected a header and at least one probe")
        return
    # check that there is a header
    if not result[0].startswith("probe"):
        UTIL_log.print_error("Header incorrect:", exit=False)
        UTIL_log.print_error(result[0])
    # check that the probe with the highest priority is correct
    if not result[1].startswith("CTCGATT"):
        UTIL_log.print_error("Predicted probe not correct", exit=False)
        UTIL_log.print_error(result[1])

    UTIL_log.print_message("Result is correct.\n")
def design():
    """Print the help of the ``design`` command to stderr."""
    emit = sys.stderr.write
    paint = UTIL_log.colour

    # HEADER -------------------------------------------------------------------
    emit("\n")
    emit(paint("Usage: ", "cyan"))
    emit("fish_probes design ")
    for flag, placeholder in (("-s ", "<seq> "),
                              ("-t ", "<tax> "),
                              ("-c ", "<clade>")):
        emit(paint(flag, "blue"))
        emit(placeholder)
    emit(" [option]\n\n")

    # PARAMETERS ---------------------------------------------------------------
    # Each row: flag, value type, description, trailing default / newline.
    rows = (
        (" -s ", "FILE ", "fasta file with the 16S sequences", "\n"),
        (" -t ", "FILE ",
         "file with the taxonomy for the 16S sequences provided in -s", "\n"),
        (" -c ", "STR ",
         "clade for which we need to find the probe (if more than one, separate with spaces)",
         "\n"),
        (" -k ", "INT ", "length of the probe", " [20]\n"),
        (" -o ", "FILE ", "output file name", " [stdout]\n"),
        (" -p ", "FLOAT ",
         "minimum fraction of sequences that should contain the selected probe",
         " [0.9]\n"),
        (" -v ", "INT ",
         "verbose level: 1=error, 2=warning, 3=message, 4+=debugging",
         " [3]\n"),
    )
    for flag, value_type, description, tail in rows:
        emit(paint(flag, "blue"))
        emit(value_type)
        emit(description)
        emit(paint(tail, "magenta"))