Example #1
0
def run_prodigal(genome):
    """Predict genes with prodigal and return re-headered FASTA files.

    Parameters
    ----------
     genome: path of the genome file given to ``prodigal -i``; it is also used
             as the prefix of the new sequence identifiers [string]
    Returns
    -------
     (parsed_genes, parsed_proteins): paths of two temporary FASTA files (genes
     and proteins) whose headers are renamed to ``<genome>_<index>``, with the
     index starting at 1. These files are not deleted by this function.
    Raises
    ------
     ValueError if prodigal is not in the path or exits with a non-zero code.
    """
    if not is_tool("prodigal"):
        raise ValueError("[E::align] Error: prodigal is not in the path.\n")

    def reserve_temp_path():
        # prodigal writes to the path itself, so we only need the name: the
        # handle is closed right away instead of being left open
        with tempfile.NamedTemporaryFile(delete=False, mode="w") as handle:
            return handle.name

    def copy_fasta(fasta_file, seqid, is_binary=True):
        # copy fasta_file into a new temporary file, renaming the headers;
        # index stays 0 when the input contains no sequences
        index = 0
        with tempfile.NamedTemporaryFile(
                delete=False,
                mode="w") as fasta_out, open(fasta_file) as fasta_in:
            for index, (_sid, seq) in enumerate(
                    read_fasta(fasta_in, is_binary=is_binary), start=1):
                print(f">{seqid}_{index}", seq, sep="\n", file=fasta_out)
            fasta_out.flush()
            os.fsync(fasta_out.fileno())
            return fasta_out.name, index

    # we need two files, one for the genes and one for the proteins
    raw_files = []
    try:
        raw_files.append(reserve_temp_path())
        raw_files.append(reserve_temp_path())
        gene_path, protein_path = raw_files

        # the arguments are passed as a list, so that paths containing spaces
        # or quotes reach prodigal unchanged
        cmd = ["prodigal", "-i", str(genome), "-d", gene_path, "-a", protein_path]
        parse_cmd = subprocess.Popen(cmd, stdout=DEVNULL, stderr=subprocess.PIPE)
        # we save stderr, it is reported if prodigal fails
        _, raw_stderr = parse_cmd.communicate()
        if parse_cmd.returncode:
            # undecodable bytes must not hide the real error
            all_stderr = raw_stderr.decode("ascii", errors="replace")
            raise ValueError(f"[E::align] Error. prodigal failed\n\n{all_stderr}")

        # we re-name the headers of the fasta files, genes and proteins get
        # the same "<genome>_<index>" identifiers in order of appearance
        parsed_genes, _ = copy_fasta(gene_path, genome, is_binary=False)
        parsed_proteins, _ = copy_fasta(protein_path, genome, is_binary=False)
    finally:
        # the raw prodigal output is removed on success and on failure
        for path in raw_files:
            os.remove(path)

    return parsed_genes, parsed_proteins
Example #2
0
def rev_complement(seq_file, verbose):
    """Write the reverse complement of seq_file to a temporary file.

    Parameters
    ----------
     seq_file: file with the sequences, given to ``seqtk seq -r`` [string]
     verbose:  how much info to print [int]
    Returns
    -------
     Path of the temporary file that contains the output of seqtk. The file is
     not deleted by this function.

    The program exits with status 1 if seqtk is not in the path or if it fails.
    """
    # we use seqtk to reverse complement
    if not is_tool("seqtk"):
        sys.stderr.write("[E::align] Error: seqtk is not in the path. Please install seqtk.\n")
        sys.exit(1)

    if verbose > 2:
        sys.stderr.write("Create file with reverse complement...")

    # the arguments are passed as a list, so that a path containing spaces
    # reaches seqtk unchanged
    cmd = ["seqtk", "seq", "-r", seq_file]

    # temp file: seqtk writes directly to it, our handle is closed when the
    # with block ends (after seqtk finished)
    with tempfile.NamedTemporaryFile(delete=False, mode="w") as rev_file:
        if verbose > 4:
            sys.stderr.write("\nCommand used to reverse complement: "
                             + " ".join(cmd) + " > " + rev_file.name + "\n")
        parse_cmd = subprocess.Popen(cmd, stdout=rev_file)
        return_code = parse_cmd.wait()

    if return_code:
        # do not leave a partial result behind
        os.remove(rev_file.name)
        sys.stderr.write("\n[E::align] Error. seqtk failed\n")
        sys.exit(1)

    if verbose > 2:
        sys.stderr.write("done\n")
    return rev_file.name
def check_tool(seq_file, hmm_file, use_cmalign):
    """Check that the alignment tools run and report the alignment quality.

    The tools are checked to be in the path, then the first three sequences of
    seq_file are aligned against hmm_file and, for each of them, the number of
    matched internal states, deletions and insertions is written to stderr.

    Parameters
    ----------
     seq_file:    fasta file with the sequences to test [string]
     hmm_file:    file with the model given to the aligner [string]
     use_cmalign: if True, we use cmalign. If False, we use hmmalign [bool]
    Returns
    -------
     True if a check failed (tool missing, tool failed, no aligned sequence);
     None if all the checks passed.

    The program exits with status 1 if the temporary file cannot be saved.
    """
    correct = f"{bcolors.OKGREEN}{bcolors.BOLD}{bcolors.UNDERLINE}correct{bcolors.ENDC}\n"

    def report_error(message):
        sys.stderr.write(f"\n{bcolors.FAIL}{bcolors.BOLD}{bcolors.UNDERLINE} ERROR:{bcolors.ENDC} ")
        sys.stderr.write(message)

    def count_states(aligned):
        # upper case letters are internal states that match (mismatches
        # included), "-" are deletions and lower case letters are insertions
        matches = sum(1 for char in aligned if char.isupper())
        insertions = sum(1 for char in aligned if char.islower())
        return matches, aligned.count("-"), insertions

    # check the aligner
    if use_cmalign:
        aligner, package = "cmalign", "Infernal"
        sys.stderr.write("Check that 'cmalign' is in the path...................")
    else:
        aligner, package = "hmmalign", "HMMER3"
        sys.stderr.write("Check that 'hmmalign' is in the path..................")
    if not is_tool(aligner):
        report_error(f"{aligner} is not in the path. Please install {package}.\n")
        return True
    # if we arrive here, then the tool is in the path:
    sys.stderr.write(correct)

    # check esl-reformat
    sys.stderr.write("Check that 'esl-reformat' is in the path..............")
    if not is_tool("esl-reformat"):
        report_error("esl-reformat is not in the path. Please install Easel.\n")
        return True
    # if we arrive here, then the tool is in the path:
    sys.stderr.write(correct)

    # check that the file is correct -------------------------------------------
    sys.stderr.write("Try to run alignment tool.............................")
    sys.stderr.flush()

    # we collect the first three fasta sequences and stop reading afterwards
    first_records = []
    n_headers = 0
    with open(seq_file, "r") as seq_in:
        for line in seq_in:
            if line.startswith(">"):
                n_headers = n_headers + 1
                if n_headers > 3:
                    break
            first_records.append(line)

    temp_file = tempfile.NamedTemporaryFile(delete=False, mode="w")
    try:
        os.chmod(temp_file.name, 0o644)
        try:
            with temp_file:
                temp_file.writelines(first_records)
                temp_file.flush()
                os.fsync(temp_file.fileno())
        except OSError:
            # this function has no verbose parameter, the error is always shown
            report_error("Error when saving the temp file\n")
            sys.exit(1)

        # we call the command; arguments are passed as a list so that paths
        # containing spaces are not split
        align_cmd = subprocess.Popen([aligner, hmm_file, temp_file.name],
                                     stdout=subprocess.PIPE)
        # parse the alignment
        parse_cmd = subprocess.Popen(["esl-reformat", "a2m", "-"],
                                     stdin=align_cmd.stdout,
                                     stdout=subprocess.PIPE)

        all_lines = list(linearise_fasta(parse_cmd.stdout, head_start=1))

        # both processes are always waited for, also when the first one failed
        align_cmd.stdout.close()
        align_return_code = align_cmd.wait()
        parse_cmd.stdout.close()
        parse_return_code = parse_cmd.wait()
    finally:
        # remove temporary file, whatever happened above
        os.remove(temp_file.name)

    if align_return_code or parse_return_code:
        return True
    if not all_lines:
        report_error("The alignment tool did not return any aligned sequence\n")
        return True
    # if we arrive here, then the tools run correctly:
    sys.stderr.write(correct)

    # check alignment quality --------------------------------------------------
    sys.stderr.write("\nCheck alignment quality:\n")
    aligned_seqs = [entry.split("\t")[1] for entry in all_lines]

    # number of internal HMM states: gaps (deletions) and capital letters of
    # the first sequence
    first_matches, first_deletions, _ = count_states(aligned_seqs[0])
    n_internal_states = first_matches + first_deletions
    sys.stderr.write(" Internal states: "+str(n_internal_states)+"\n")

    def percentage(value):
        # avoid a division by zero when there are no internal states
        if not n_internal_states:
            return 0
        return round(value/n_internal_states * 100)

    for count, aligned in enumerate(aligned_seqs, start=1):
        sys.stderr.write("\n Sequence "+str(count)+":\n")
        mat_i_s, deletions, insertions = count_states(aligned)
        sys.stderr.write("   Internal states matches: "+str(mat_i_s)+" ("+str(percentage(mat_i_s))+"%)\n")
        sys.stderr.write("   Deletions: "+str(deletions)+" ("+str(percentage(deletions))+"%)\n")
        sys.stderr.write("   Insertions: "+str(insertions)+"\n")
Example #4
0
def calc_al(fasta_file, hmm_file, use_cmalign, n_threads, verbose):
    """Align the sequences and calculate the percentage of matched states.

    Parameters
    ----------
     fasta_file:  file with the sequences to align [string]
     hmm_file:    file with the model given to the aligner [string]
     use_cmalign: if True, we use cmalign. If False, we use hmmalign [bool]
     n_threads:   value passed to ``cmalign --cpu`` (not used with hmmalign)
                  [string/int]
     verbose:     how much info to print [int]
    Returns
    -------
     Dictionary: sequence id -> matches / (matches + deletions) * 100, where
     matches are the upper case letters and deletions the "-" of the aligned
     sequence. The value is 0.0 for a sequence without matches and deletions.

    The program exits with status 1 if a tool is missing or fails.
    """
    # check that the tools are available
    if use_cmalign:
        if not is_tool("cmalign"):
            sys.stderr.write("[E::align] Error: cmalign is not in the path. Please install Infernal.\n")
            sys.exit(1)
    else:
        if not is_tool("hmmalign"):
            sys.stderr.write("[E::align] Error: hmmalign is not in the path. Please install HMMER3.\n")
            sys.exit(1)

    if not is_tool("esl-reformat"):
        sys.stderr.write("[E::align] Error: esl-reformat is not in the path. Please install Easel.\n")
        sys.exit(1)

    # prepare the command to run; the arguments are kept as a list so that
    # paths containing spaces are not split
    if use_cmalign:
        align_args = ["cmalign", "--cpu", str(n_threads), hmm_file, fasta_file]
    else:
        align_args = ["hmmalign", hmm_file, fasta_file]

    if verbose > 4:
        sys.stderr.write("Command used to align the sequences: "+" ".join(align_args)+"\n")

    # we call the command
    align_cmd = subprocess.Popen(align_args, stdout=subprocess.PIPE)

    # parse the alignment
    parse_cmd = subprocess.Popen(["esl-reformat", "a2m", "-"],
                                 stdin=align_cmd.stdout,
                                 stdout=subprocess.PIPE)

    all_lines = dict()
    for line in linearise_fasta(parse_cmd.stdout, head_start=0):
        fields = line.split("\t")
        seq_id, aligned = fields[0], fields[1]
        # internal states that match (even mismatch is counted I guess) are
        # upper case letters, deletions are "-"; insertions (lower case
        # letters) do not enter the calculation
        mat_i_s = sum(1 for char in aligned if char.isupper())
        deletions = aligned.count("-")
        n_states = mat_i_s + deletions
        # a sequence without any internal state would divide by zero
        all_lines[seq_id] = (mat_i_s/n_states) * 100 if n_states else 0.0

    # check that hmmalign/cmalign finished correctly
    align_cmd.stdout.close()
    return_code = align_cmd.wait()
    if return_code:
        sys.stderr.write("[E::align] Error. hmmalign/cmalign failed\n")
        sys.exit(1)
    # check that converting the file worked correctly
    parse_cmd.stdout.close()
    return_code = parse_cmd.wait()
    if return_code:
        sys.stderr.write("[E::align] Error. esl-reformat failed\n")
        sys.exit(1)

    return all_lines
Example #5
0
def align_generator(seq_file, protein_file, hmm_file, use_cmalign, n_threads, verbose, return_numpy, min_perc_state):
    """Align sequences and transform them into 1-hot encoding, ready for
       classification.
    Parameters
    ----------
     seq_file:       file with the nucleotide sequences [string]
     protein_file:   file with the protein sequences [string or None]
     hmm_file:       file with the hmm model [string]
     use_cmalign:    if True, we use cmalign. If false, we use hmmalign [bool]
     n_threads:      number of threads to use for cmalign (hmmalign can run only
                     on one thread) [string/int]
     verbose:        how much info to print [int]
     return_numpy:   True if you want to return a numpy array instead of a string
     min_perc_state: a sequence is returned only if the percentage of aligned
                     characters given by convert_alignment is at least this value
    Returns
    -------
     Returns a generator with:
     (fasta_id, aligned_sequence) tuples
    Raises
    ------
     ValueError if a required tool is not in the path or if it fails.
     The program exits with status 1 if protein and gene identifiers differ.
    """

    # number of sequences that pass and do not pass the filter
    n_pass, n_not_pass = 0, 0
    # check that the tools are available; only the aligner that is going to be
    # used is required (hmmalign is not needed when we use cmalign)
    if use_cmalign:
        if not is_tool("cmalign"):
            raise ValueError("[E::align] Error: cmalign is not in the path. Please install Infernal.")
    elif not is_tool("hmmalign"):
        raise ValueError("[E::align] Error: hmmalign is not in the path. Please install HMMER3.")
    if not is_tool("esl-reformat"):
        raise ValueError("[E::align] Error: esl-reformat is not in the path. Please install Easel.")

    # the arguments are kept as a list, so that paths containing spaces are
    # not split
    if use_cmalign:
        align_args = ["cmalign", "--cpu", str(n_threads)]
    else:
        align_args = ["hmmalign"]
    seq_input = protein_file if protein_file else seq_file
    align_args = align_args + [str(hmm_file), str(seq_input)]

    if verbose > 4:
        command_text = " ".join(align_args)
        print(f"Command used to align the sequences: {command_text}", file=sys.stderr)

    # the gene file is read along with the protein alignment; the handle is
    # kept so that it can be closed when the generator finishes or is closed
    gene_handle = open(seq_file) if protein_file else None
    try:
        # run the command
        align_proc = subprocess.Popen(align_args, stdout=subprocess.PIPE)

        # command to parse the alignment from STOCKHOLM to fasta format
        parse_cmd = subprocess.Popen(["esl-reformat", "a2m", "-"],
                                     stdin=align_proc.stdout,
                                     stdout=subprocess.PIPE)

        aligned_stream = read_fasta(parse_cmd.stdout, head_start=1)
        if gene_handle is not None:
            seq_stream = zip(aligned_stream,
                             read_fasta(gene_handle, is_binary=False, head_start=1))
        else:
            seq_stream = aligned_stream

        for item in seq_stream:
            if protein_file:
                (pid, pseq), (gid, gseq) = item
                if pid != gid:
                    sys.stderr.write("[E::align] Error. protein and gene identifiers {} {} don't match.".format(pid, gid))
                    sys.exit(1)
                gseq = protein2gene_alignment(gid, pseq, gseq, check_length=True)
            else:
                gid, gseq = item

            converted_ali, perc_aligned_characters = convert_alignment(gseq, verbose, as_numpy=return_numpy)
            if perc_aligned_characters >= min_perc_state:
                n_pass += 1
                yield gid, converted_ali
            else:
                n_not_pass += 1
    finally:
        if gene_handle is not None:
            gene_handle.close()

    # check that hmmalign/cmalign finished correctly
    align_proc.stdout.close()
    return_code = align_proc.wait()
    if return_code:
        raise ValueError("[E::align] Error. hmmalign/cmalign failed.")
    # check that converting the file worked correctly
    parse_cmd.stdout.close()
    return_code = parse_cmd.wait()
    if return_code:
        raise ValueError("[E::align] Error. esl-reformat failed.")

    # print the number of sequences that were filtered
    if verbose > 3:
        print(f" Number of sequences that pass the filter: {n_pass}", file=sys.stderr)
        print(f" Number of sequences that do not pass the filter: {n_not_pass}", file=sys.stderr)
Example #6
0
 def __init__(self):
     """Check that prodigal is available.

     Raises ValueError if is_tool("prodigal") is false.
     """
     # NOTE(review): the enclosing class and any further constructor code are
     # not visible in this snippet
     if not is_tool("prodigal"):
         raise ValueError("[E::align] Error: prodigal is not in the path.\n")