def predict_probes(sequences,taxonomy,args):
    """Run the probe-design steps in order and save the ranked probes.

    Steps: split the sequences by clade, collect candidate k-mers from the
    selected clade, count their hits in the other sequences, order the
    candidates and hand everything to `save_result`.

    Reads `verbose`, `sel_clade`, `probe_len`, `perc_seq` and `outfile`
    from `args`. Sets the module-level VERBOSE as a side effect.
    """
    global VERBOSE
    VERBOSE = args.verbose

    def announce(step):
        # progress lines are only shown from verbosity level 3 upwards
        if VERBOSE > 2:
            UTIL_log.print_log(step)

    # Step 0: separate the selected clade from everything else
    announce("Identify sequences from the selected clade")
    in_clade, out_clade = split_sequences(taxonomy, args.sel_clade, sequences)

    # Step 1: candidate k-mers from the selected clade
    announce("Identify k-mers for the query clade")
    recall, precision = find_conserved_regions(
        in_clade, args.probe_len, args.perc_seq)

    # Step 2: count hits outside the clade; `precision` is updated in place,
    # the returned mapping is not needed here
    announce("Check if the identified k-mers are present in the other clades")
    check_uniqueness(precision, out_clade, args.probe_len)

    # Step 3: order the candidates
    announce("Prioritize selected probes")
    ranked = priotitize_probes(recall, precision, len(in_clade))

    # Step 4: write the result to the output file / stdout
    announce("Save the result")
    save_result(ranked, args.outfile, len(in_clade), recall, precision)
Ejemplo n.º 2
0
def _remove_if_present(path):
    """Best-effort removal of a leftover temporary file."""
    try:
        os.remove(path)
    except OSError:
        # nothing useful can be done if the cleanup itself fails
        pass


def save_file(lines, outfile):
    """Write `lines` to `outfile`, or to stdout when `outfile` is None.

    When a destination is given, the content is first written to a
    temporary file (permissions 0o644), flushed and synced to disk, and
    only then moved to `outfile`, so a failed run does not leave a
    half-written destination file.

    Args:
        lines: iterable of strings, written as they are (no newline added).
        outfile: destination path, or None to write to stdout.

    Failures are reported through UTIL_log.print_error; the temporary file
    is removed before reporting.
    """
    # no destination: stream straight to stdout
    if outfile is None:
        sys.stdout.writelines(lines)
        return

    # create the temporary file ------------------------------------------------
    temp_file = None
    try:
        temp_file = tempfile.NamedTemporaryFile(delete=False, mode="w")
        os.chmod(temp_file.name, 0o644)
    except Exception as e:
        if temp_file is not None:
            temp_file.close()
            _remove_if_present(temp_file.name)
        UTIL_log.print_error("couldn't create the file. Message:", exit=False)
        UTIL_log.print_error(str(e))
        # only reached if print_error returns: there is nothing to write to
        return

    # write, close and move to final destination -------------------------------
    try:
        with temp_file:
            temp_file.writelines(lines)
            temp_file.flush()
            os.fsync(temp_file.fileno())
        shutil.move(temp_file.name, outfile)
    except Exception as e:
        _remove_if_present(temp_file.name)
        UTIL_log.print_error("couldn't save the file:", exit=False)
        UTIL_log.print_error(str(e))
def test_probe():
    """Print the help of the `check_probe` command to stderr.

    The usage line names the command `check_probe`, which is the name the
    argument parser accepts.
    """
    write = sys.stderr.write

    # HEADER -------------------------------------------------------------------
    write("\n")
    write(UTIL_log.colour("Usage: ", "cyan"))
    write("fish_probes check_probe ")
    write(UTIL_log.colour("-i ", "blue"))
    write("<probe_seq> ")

    write(" [option]\n\n")

    # PARAMETERS ---------------------------------------------------------------
    # (flag, value type, description, trailing text with the default value)
    parameters = (
        ("   -i  ", "STR     ", "probe to test", "\n"),
        ("   -sp ", "        ",
         "Evaluate also the two half of the probes alone", "\n"),
        ("   -v  ", "INT     ",
         "verbose level: 1=error, 2=warning, 3=message, 4+=debugging",
         " [3]\n"),
    )
    for flag, value_type, description, tail in parameters:
        write(UTIL_log.colour(flag, "blue"))
        write(value_type)
        write(description)
        write(UTIL_log.colour(tail, "magenta"))
def main_message(tool_version):
    """Return the top-level usage text (program, version, usage, commands).

    `tool_version` is converted with str() before being inserted.
    """
    paint = UTIL_log.colour
    pieces = [
        # leading block: newline, NUL character, newline
        "\n\00\n",
        # HEADER ---------------------------------------------------------------
        paint("Program: ", "cyan"),
        "fish_probes - a tool to design FISH probes for 16S sequences\n",
        paint("Version: ", "cyan"),
        str(tool_version) + "\n",
        paint("Usage: ", "cyan"),
        "fish_probes <command> [options]\n\n",
        paint("Command:\n", "cyan"),
        # COMMANDS -------------------------------------------------------------
        paint("   design       ", "green_bold"),
        "Identify suitable probes for a given clade\n",
        paint("   check_probe  ", "green_bold"),
        "Check physical properties of a probe\n",
        paint("   test         ", "green_bold"),
        "Test the tool\n",
        # CONCLUDING -----------------------------------------------------------
        "\nType fish_probes <command> to print the help for a specific command",
    ]
    return "".join(pieces)
Ejemplo n.º 5
0
def load_taxonomy(taxonomy_file):
    """Load a tab-separated taxonomy file into a dict.

    Each line is split on tabs; the first field is used as key and the
    second field as value. Blank lines are skipped, and lines with fewer
    than two fields are reported through UTIL_log.print_error.

    Args:
        taxonomy_file: path of the file to read.

    Returns:
        dict mapping the first field to the second field of each line.
    """
    result = dict()
    try:
        with open(taxonomy_file, "r") as handle:
            for line_number, line in enumerate(handle, start=1):
                vals = line.rstrip().split("\t")
                # tolerate empty lines (e.g. trailing newlines)
                if vals == [""]:
                    continue
                if len(vals) < 2:
                    UTIL_log.print_error(
                        "Line " + str(line_number) + " of the taxonomy file"
                        " does not contain a tab-separated taxonomy")
                    continue
                result[vals[0]] = vals[1]
    except (IOError, OSError):
        # IOError is listed explicitly for interpreters where it is not an
        # alias of OSError
        UTIL_log.print_error("Cannot load the taxonomy file", 5)

    if VERBOSE > 2:
        UTIL_log.print_message("Found taxonomy information for " +
                               str(len(result)) + " sequences.\n")
    return result
def find_conserved_regions(seq_sel_clade,k,perc_seq_with_kmer):
    """Collect the k-mers that are frequent enough in the selected clade.

    Every sequence is passed to `find_kmers`; only the first value it
    returns is used. A k-mer is kept when its count is at least
    `len(seq_sel_clade) * perc_seq_with_kmer`.
    NOTE(review): the count is per sequence only if `find_kmers` reports
    each k-mer once per sequence; confirm against `find_kmers`.

    Args:
        seq_sel_clade: dict of the sequences from the selected clade.
        k: k-mer length, passed to `find_kmers`.
        perc_seq_with_kmer: minimum fraction of sequences (0-1).

    Returns:
        (kmers_recall, kmers_precision): two dicts with the same keys.
        kmers_recall maps each kept k-mer to its count in the selected
        clade; kmers_precision maps it to 0.
    """
    # count the k-mers over all the sequences of the selected clade
    count_mers = dict()
    for s in seq_sel_clade:
        allK, _ = find_kmers(seq_sel_clade[s],k)
        for kmer in allK:
            count_mers[kmer] = count_mers.get(kmer, 0) + 1

    if VERBOSE > 2:
        UTIL_log.print_message("Identifed "+str(len(count_mers))+" unique "+str(k)+"-mers.")

    # keep the k-mers present in at least the requested fraction of sequences;
    # the small tolerance protects the "at least" comparison from float
    # rounding in the product
    n_seq = len(seq_sel_clade)
    min_count = n_seq*perc_seq_with_kmer - 1e-9
    kmers_recall = dict()
    kmers_precision = dict() # this will be filled in by "check_uniqueness"
    n_in_all = 0 # used only to print
    for kmer, count in count_mers.items():
        if count == n_seq:
            n_in_all = n_in_all + 1
        if count >= min_count:
            kmers_recall[kmer] = count
            kmers_precision[kmer] = 0

    if VERBOSE > 2:
        UTIL_log.print_message("  (Identifed "+str(n_in_all)+" "+str(k)+"-mers present in all sequences)")
        UTIL_log.print_message(str(len(kmers_precision))+" "+str(k)+"-mers will go to the next step.")
        UTIL_log.print_message("(only k-mers present in at least "+str(perc_seq_with_kmer*100)+"% of the sequences will be used).\n")

    if len(kmers_precision) == 0:
        if VERBOSE > 1:
            UTIL_log.print_warning("No k-mers passed the filter. Please decrease the threshold in -p")
    # return
    return kmers_recall,kmers_precision
def check_uniqueness(kmers_precision, seq_other, probe_len):
    """Count how often the candidate k-mers occur in the other sequences.

    `kmers_precision` is updated in place: every time a candidate k-mer is
    reported by `find_kmers` for a sequence of `seq_other`, its counter is
    increased by one.

    Args:
        kmers_precision: dict k-mer -> counter, modified in place.
        seq_other: dict of the sequences outside the selected clade.
        probe_len: k-mer length, passed to `find_kmers`.

    Returns:
        dict mapping each matched k-mer to the list of keys of `seq_other`
        in which it was found.
    """
    n_matching_to_other = 0
    other_sel_clades = dict()
    for s in seq_other:
        # TODO: the second value returned by find_kmers (the k-mers with an N
        # or others) still needs to be evaluated; it is ignored for now
        this_kmers, _ = find_kmers(seq_other[s],probe_len)
        for kmer in this_kmers:
            if kmer in kmers_precision:
                kmers_precision[kmer] = kmers_precision[kmer] + 1
                other_sel_clades.setdefault(kmer, list()).append(s)
                n_matching_to_other = n_matching_to_other + 1

    if VERBOSE > 2:
        UTIL_log.print_message("The selected probes map to other "+str(n_matching_to_other)+" sequences.\n")
    return other_sel_clades
def split_sequences(taxonomy,sel_clade,sequences):
    """Split the sequences into selected clade vs. everything else.

    A sequence belongs to the selected clade when at least one entry of
    `sel_clade` is equal to one of the ";"-separated levels of its
    taxonomy. Only the identifiers listed in `taxonomy` are considered.

    Returns:
        (seq_sel_clade, seq_other): two dicts identifier -> sequence.
    """
    inside = dict()
    outside = dict()
    for name, lineage in taxonomy.items():
        levels = lineage.split(";")
        # exact match on a taxonomy level, not a substring match
        target = inside if any(c in levels for c in sel_clade) else outside
        target[name] = sequences[name]

    if VERBOSE > 2:
        UTIL_log.print_message("Sequences belonging to the selected clade: "+str(len(inside))+".")
        UTIL_log.print_message("Sequences belonging to other clades: "+str(len(outside))+".\n")
    return inside, outside
Ejemplo n.º 9
0
def check_input(sequences, taxonomy, args):
    """Validate sequences and taxonomy against each other.

    Checks, in this order:
      1. at least one clade of `args.sel_clade` appears as a ";"-separated
         level in some taxonomy entry;
      2. every sequence has a taxonomy entry;
      3. taxonomy entries without a sequence are deleted from `taxonomy`
         (in place).

    Problems found in 1 and 2 are reported with UTIL_log.print_error.
    """
    # 1. is any of the requested clades known?
    clade_known = any(
        wanted in lineage.split(";")
        for lineage in taxonomy.values()
        for wanted in args.sel_clade)
    if not clade_known:
        UTIL_log.print_error("Selected clade(s) not present in the taxonomy")

    # 2. the taxonomy may have more entries than there are sequences, but a
    # sequence without taxonomy is an error
    for name in sequences:
        if name not in taxonomy:
            UTIL_log.print_error("Sequence '" + name +
                                 "' does not have a taxonomy")

    # 3. drop the taxonomy entries that have no matching sequence; the names
    # are collected first because the dict cannot shrink while iterated
    orphans = [name for name in taxonomy if name not in sequences]
    for name in orphans:
        del taxonomy[name]
    if orphans and VERBOSE > 2:
        UTIL_log.print_message(
            "Removed " + str(len(orphans)) +
            " taxonomy line(s) because no sequence was present.\n")
Ejemplo n.º 10
0
def load_sequences(sequences_file):
    """Load a fasta file into a dict header -> sequence.

    The header is the whole line after ">" (trailing whitespace removed);
    the sequence is the concatenation of the following lines, each with
    its trailing whitespace removed. When a header appears more than once,
    the last record wins.

    Args:
        sequences_file: path of the fasta file.

    Returns:
        dict mapping each header to its sequence.

    Problems (file cannot be opened, first line not starting with ">") are
    reported with UTIL_log.print_error.
    """
    result = dict()
    try:
        handle = open(sequences_file, "r")
    except (IOError, OSError):
        # IOError is listed explicitly for interpreters where it is not an
        # alias of OSError
        UTIL_log.print_error("Cannot load the fasta file with the sequences")
        # only reached if print_error returns: there is nothing to parse
        return result

    # the with-block closes the file also when an error is reported
    with handle:
        first_line = handle.readline()
        if not first_line.startswith(">"):
            UTIL_log.print_error("Not a fasta file", 4)
            return result

        header = first_line.rstrip()[1:]
        chunks = list()
        for line in handle:
            if line.startswith(">"):
                # a new record starts: store the previous one
                result[header] = "".join(chunks)
                header = line.rstrip()[1:]
                chunks = list()
            else:
                chunks.append(line.rstrip())

        # store the last record
        result[header] = "".join(chunks)

    if VERBOSE > 2:
        UTIL_log.print_message("Found " + str(len(result)) + " sequences.\n")
    return result
Ejemplo n.º 11
0
def main():
    """Entry point: parse the command line, run the command, exit with 0."""
    args = UTIL_arg_parser.input_parser(tool_version)

    # echo the call when messages are enabled
    if args.verbose > 2:
        for text in ("Call: ", " ".join(sys.argv)+"\n"):
            UTIL_log.print_message(text)

    command = args.command
    if command == "design":
        # find probes ----------------------------------------------------------
        sequences,taxonomy = C_load_input.load_and_check_input(args)
        C_predict_probes.predict_probes(sequences,taxonomy,args)
    elif command == "check_probe":
        # test a given probe ---------------------------------------------------
        report = UTIL_probe.create_to_print(args.input,header = True,split = args.split_probe)
        print(report)
    elif command == "test":
        # test the tool --------------------------------------------------------
        C_test.test()

    sys.exit(0)
Ejemplo n.º 12
0
def priotitize_probes(kmers_recall,kmers_precision,n_seq_clade):
    """Order the probes: the perfect ones first, all the others after.

    A probe is "perfect" when its recall count equals `n_seq_clade` and
    its precision counter is 0. Within each of the two groups the probes
    keep the iteration order of `kmers_recall`.

    Args:
        kmers_recall: dict probe -> count in the selected clade.
        kmers_precision: dict probe -> count in the other sequences.
        n_seq_clade: number of sequences in the selected clade.

    Returns:
        list of all the keys of `kmers_recall`, perfect probes first.
    """
    if VERBOSE > 2:
        UTIL_log.print_message("We order the probes by the number of wrong clades.")

    probe_order = list()
    missing = list()
    for p in kmers_recall:
        # counts are compared directly: no division, so n_seq_clade == 0 is
        # safe and integer division cannot produce false matches
        if kmers_recall[p] == n_seq_clade and kmers_precision[p] == 0:
            probe_order.append(p)
        else:
            missing.append(p)

    if VERBOSE > 2:
        UTIL_log.print_message(str(len(probe_order))+" probe(s) present in all the selected clade(s) and have no contamination.\n")
    probe_order.extend(missing)

    return probe_order
Ejemplo n.º 13
0
def load_and_check_input(args):
    """Load the sequences and the taxonomy named in `args` and validate them.

    Reads `args.verbose`, `args.sequences` and `args.taxonomy`; `args` is
    also passed on to `check_input`. Sets the module-level VERBOSE as a
    side effect.

    Returns:
        (sequences, taxonomy) as returned by `load_sequences` and
        `load_taxonomy`, after `check_input` has processed them.
    """
    global VERBOSE
    VERBOSE = args.verbose

    def announce(step):
        # progress lines are only shown from verbosity level 3 upwards
        if VERBOSE > 2:
            UTIL_log.print_log(step)

    # load data from files
    announce("Load sequences")
    loaded_sequences = load_sequences(args.sequences)
    announce("Load taxonomy")
    loaded_taxonomy = load_taxonomy(args.taxonomy)

    # check that the input is correct
    announce("Check input files")
    check_input(loaded_sequences, loaded_taxonomy, args)

    return loaded_sequences, loaded_taxonomy
Ejemplo n.º 14
0
def input_parser(tool_version):
    """Parse and validate the command line of fish_probes.

    The built-in argparse help is disabled (`add_help=False`); `-h/--help`
    is a plain flag, and the help of the selected command is printed by
    the UTIL_print_menus functions.

    Args:
        tool_version: version shown in the usage text and by `--version`.

    Returns:
        the argparse namespace with: command, sequences, taxonomy,
        sel_clade, verbose, probe_len, outfile, input, perc_seq, help and
        split_probe.

    Invalid values and missing mandatory options are reported with
    UTIL_log.print_error.
    """
    parser = argparse.ArgumentParser(
        usage=UTIL_print_menus.main_message(tool_version),
        formatter_class=CapitalisedHelpFormatter,
        add_help=False)

    # COMMAND
    parser.add_argument('command',
                        action="store",
                        default=None,
                        help='command',
                        choices=['design', 'check_probe', 'test'])

    # File with the sequences in fasta format
    parser.add_argument('-s',
                        dest='sequences',
                        action="store",
                        default=None,
                        help='File containing the sequences')

    # File with the taxonomy, the taxonomy is separated by ";", and there is a
    # "\t" between name and tax (Example: "gene1\tclade1;clade2;clade3\n")
    parser.add_argument('-t',
                        dest='taxonomy',
                        action="store",
                        default=None,
                        help='File containing the taxonomy')

    # Clade(s) for which we have to find the primers (one or more values)
    parser.add_argument('-c',
                        dest='sel_clade',
                        action="store",
                        default=None,
                        help='Clade selected to design the primers',
                        nargs="+")

    # Verbose level
    parser.add_argument('-v', action='store', type=int, default=3,
                        dest='verbose', help='Verbose level: 1=error,'\
                        ' 2=warning, 3=message, 4+=debugging [3]')

    # Length of the probes that we identify
    parser.add_argument('-k',
                        action='store',
                        type=int,
                        default=20,
                        dest='probe_len',
                        help='Probe length [20]')

    # Output file
    parser.add_argument('-o',
                        action='store',
                        default=None,
                        dest='outfile',
                        help='Output file [stdout]')

    # General input file
    parser.add_argument('-i',
                        action='store',
                        default=None,
                        dest='input',
                        help='General input')

    # Min fraction of sequences from the selected clade
    parser.add_argument(
        '-p',
        action='store',
        default=0.9,
        type=float,
        dest='perc_seq',
        help=
        'Minimum fraction of sequences that should contain the selected probe [0.9]'
    )

    # help (handled manually below, one menu per command)
    parser.add_argument('-h',
                        '--help',
                        action='store_true',
                        default=False,
                        dest='help',
                        help='Print help')

    # split the probe in two halves
    parser.add_argument('-sp',
                        '--split',
                        action='store_true',
                        default=False,
                        dest='split_probe',
                        help='Evaluate also a probe split in two')

    # version
    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s {0} on python {1}'.format(
                            tool_version,
                            sys.version.split()[0]))

    args = parser.parse_args()

    ############################################################################
    # CHECK ARGUMENTS
    # check args verbose
    if args.verbose < 1:
        UTIL_log.print_error("Verbose (-v) needs to be higher than 0")
    # check args perc seq
    if args.perc_seq < 0 or args.perc_seq > 1:
        UTIL_log.print_error("Threshold (-p) should be between 0 and 1")
    # check length of the probe (the smallest accepted value is 1)
    if args.probe_len < 1:
        UTIL_log.print_error("Probe length (-k) cannot be lower than 1")

    ############################################################################
    # CHECK ARGUMENTS FOR COMMAND DESIGN
    if args.command == "design":
        # print help
        if args.help:
            UTIL_print_menus.design()
            sys.exit(0)
        # there are three mandatory input
        if args.sequences is None:
            UTIL_print_menus.design()
            UTIL_log.print_error("Missing -s.")
        if args.taxonomy is None:
            UTIL_print_menus.design()
            UTIL_log.print_error("Missing -t.")
        if args.sel_clade is None:
            UTIL_print_menus.design()
            UTIL_log.print_error("Missing -c.")

    ############################################################################
    # CHECK ARGUMENTS FOR COMMAND CHECK_PROBE
    if args.command == "check_probe":
        # print help
        if args.help:
            UTIL_print_menus.test_probe()
            sys.exit(0)
        # there is only one mandatory input
        if args.input is None:
            UTIL_print_menus.test_probe()
            UTIL_log.print_error("Missing -i.")

    return args
Ejemplo n.º 15
0
def test():
    """Run `fish_probes design` on the bundled test data and check the output.

    The tool is started as a subprocess with its stderr discarded; its
    stdout must start with a header line beginning with "probe", followed
    by a first probe line beginning with "CTCGATT". Problems are reported
    with UTIL_log.print_error.
    """
    UTIL_log.print_log("Prepare test")
    UTIL_log.print_message("Run test on /fish_probes/test/.")

    # prepare command ----------------------------------------------------------
    seq_path = os.path.join(TEST_DATA_PATH, "seq.fa")
    tax_path = os.path.join(TEST_DATA_PATH, "tax")
    # the argument list is built directly (not by splitting a string), so
    # that test-data paths containing spaces are passed on intact
    popenCMD = ["fish_probes", "design", "-c", "Firmicutes", "-k", "7", "-v1",
                "-s", seq_path, "-t", tax_path]

    UTIL_log.print_message("Command:")
    UTIL_log.print_message(" ".join(popenCMD) + "\n")

    # we run the command -------------------------------------------------------
    UTIL_log.print_log("Run test")
    devnull = open(os.devnull, 'wb')
    try:
        cmd = subprocess.Popen(popenCMD, stdout=subprocess.PIPE, stderr=devnull)

        # save the result
        result = list()
        for line in cmd.stdout:
            result.append(line.decode('ascii'))

        # check exit status
        cmd.stdout.close()
        return_code = cmd.wait()
    finally:
        devnull.close()

    if return_code:
        UTIL_log.print_error("Tool failed")
    else:
        UTIL_log.print_message("Command completed correctly.\n")

    # we check the result ------------------------------------------------------
    UTIL_log.print_log("Check result")

    # the checks below need the header and at least one probe line
    if len(result) < 2:
        UTIL_log.print_error("Output incomplete: expected a header and at least one probe")
        return

    # check that there is a header
    if not result[0].startswith("probe"):
        UTIL_log.print_error("Header incorrect:", exit=False)
        UTIL_log.print_error(result[0])

    # check that the probe with the highest priority is correct
    if not result[1].startswith("CTCGATT"):
        UTIL_log.print_error("Predicted probe not correct", exit=False)
        UTIL_log.print_error(result[1])

    UTIL_log.print_message("Result is correct.\n")
Ejemplo n.º 16
0
def design():
    """Print the help of the `design` command to stderr."""
    write = sys.stderr.write
    paint = UTIL_log.colour

    # HEADER -------------------------------------------------------------------
    write("\n")
    write(paint("Usage: ", "cyan"))
    write("fish_probes design ")
    # mandatory options, each followed by its placeholder
    for flag, placeholder in (("-s ", "<seq> "),
                              ("-t ", "<tax> "),
                              ("-c ", "<clade>")):
        write(paint(flag, "blue"))
        write(placeholder)

    write(" [option]\n\n")

    # PARAMETERS ---------------------------------------------------------------
    # (flag, value type, description, trailing text with the default value)
    parameters = (
        ("   -s  ", "FILE    ",
         "fasta file with the 16S sequences", "\n"),
        ("   -t  ", "FILE    ",
         "file with the taxonomy for the 16S sequences provided in -s", "\n"),
        ("   -c  ", "STR     ",
         "clade for which we need to find the probe (if more than one, separate with spaces)",
         "\n"),
        ("   -k  ", "INT     ",
         "length of the probe", " [20]\n"),
        ("   -o  ", "FILE    ",
         "output file name", " [stdout]\n"),
        ("   -p  ", "FLOAT   ",
         "minimum fraction of sequences that should contain the selected probe",
         " [0.9]\n"),
        ("   -v  ", "INT     ",
         "verbose level: 1=error, 2=warning, 3=message, 4+=debugging",
         " [3]\n"),
    )
    for flag, value_type, description, tail in parameters:
        write(paint(flag, "blue"))
        write(value_type)
        write(description)
        write(paint(tail, "magenta"))