def run_prodigal(genome):
    """Predict genes with prodigal and return renamed gene/protein FASTA files.

    prodigal is run on ``genome`` and writes nucleotide genes (``-d``) and
    protein translations (``-a``) into two temporary files. Both outputs are
    then copied into new temporary files in which the i-th record is renamed
    to ``>{genome}_{i}`` (1-based). The raw prodigal outputs are always
    removed, also when prodigal or the renaming step fails.

    Parameters
    ----------
     genome: path of the genome FASTA file; also used as the prefix of the
             new sequence identifiers [string]

    Returns
    -------
     (genes_path, proteins_path): paths of the two renamed FASTA files. They
     are created with ``delete=False``, so the caller has to remove them.

    Raises
    ------
     ValueError: if prodigal is not in the path or exits with a non-zero code
                 (the captured stderr is appended to the message).
    """
    if not is_tool("prodigal"):
        raise ValueError("[E::align] Error: prodigal is not in the path.\n")

    def copy_fasta(fasta_file, seqid, is_binary=True, head_start=0):
        """Copy ``fasta_file`` into a new temp file with ``>{seqid}_{n}`` headers.

        Returns the path of the new file and the number of records written.
        ``head_start`` is accepted but not used.
        """
        # Start at 0 so that an empty FASTA (prodigal found no genes) yields a
        # count of 0 instead of an UnboundLocalError on the return below.
        index = 0
        with tempfile.NamedTemporaryFile(delete=False, mode="w") as fasta_out, \
                open(fasta_file) as fasta_in:
            records = read_fasta(fasta_in, is_binary=is_binary)
            for index, (_sid, seq) in enumerate(records, start=1):
                print(f">{seqid}_{index}", seq, sep="\n", file=fasta_out)
            fasta_out.flush()
            os.fsync(fasta_out.fileno())
            return fasta_out.name, index

    # We need two files, one for the genes and one for the proteins. Only the
    # names are needed (prodigal opens the paths itself), so the handles are
    # closed right away instead of being leaked.
    raw_paths = []
    try:
        for _ in range(2):
            with tempfile.NamedTemporaryFile(delete=False, mode="w") as handle:
                raw_paths.append(handle.name)
        genes_path, proteins_path = raw_paths

        # The arguments are passed as a list so that paths containing
        # whitespace or quotes reach prodigal unchanged.
        prodigal_cmd = ["prodigal", "-i", genome,
                        "-d", genes_path, "-a", proteins_path]
        process = subprocess.Popen(prodigal_cmd, stdout=DEVNULL,
                                   stderr=subprocess.PIPE)
        # stderr is kept only to be reported if prodigal fails
        _, raw_stderr = process.communicate()
        if process.returncode:
            # errors="replace": a stray non-ASCII byte must not hide the
            # real failure behind a UnicodeDecodeError.
            all_stderr = raw_stderr.decode("ascii", errors="replace")
            raise ValueError(f"[E::align] Error. prodigal failed\n\n{all_stderr}")

        # We rename the headers of the fasta files. The record counts are
        # returned by copy_fasta but, as before, they are not compared.
        parsed_genes, _gene_count = copy_fasta(genes_path, genome, is_binary=False)
        parsed_proteins, _protein_count = copy_fasta(proteins_path, genome, is_binary=False)
    finally:
        for path in raw_paths:
            try:
                os.remove(path)
            except OSError:
                # best-effort cleanup; do not mask the original exception
                pass

    return parsed_genes, parsed_proteins
def rev_complement(seq_file, verbose):
    """Write the reverse complement of ``seq_file`` to a temporary file.

    The work is done by ``seqtk seq -r``; its stdout is redirected into a new
    temporary file created with ``delete=False``.

    Parameters
    ----------
     seq_file: path of the input sequence file [string]
     verbose:  verbosity level; progress is printed to stderr when > 2 and
               the command line when > 4 [int]

    Returns
    -------
     Path of the temporary file with the reverse-complemented sequences. The
     caller is responsible for removing it.

    The process is terminated with ``sys.exit(1)`` if seqtk is not in the
    path or if it exits with a non-zero code.
    """
    # we use seqtk to reverse complement
    if not is_tool("seqtk"):
        sys.stderr.write("[E::align] Error: seqtk is not in the path. Please install seqtk.\n")
        sys.exit(1)

    if verbose > 2:
        sys.stderr.write("Create file with reverse complement...")

    # Arguments are passed as a list so that a path containing whitespace is
    # not split into several arguments.
    argv = ["seqtk", "seq", "-r", seq_file]

    with tempfile.NamedTemporaryFile(delete=False, mode="w") as rev_file:
        rev_path = rev_file.name
        if verbose > 4:
            sys.stderr.write("\nCommand used to reverse complement: "
                             + "seqtk seq -r " + seq_file
                             + " > " + rev_path + "\n")
        # Wait for seqtk BEFORE syncing: syncing while the child is still
        # writing does not guarantee that the complete output is on disk.
        return_code = subprocess.Popen(argv, stdout=rev_file).wait()
        if not return_code:
            os.fsync(rev_file.fileno())

    if return_code:
        # do not leave a partial output file behind
        os.remove(rev_path)
        sys.stderr.write("\n[E::align] Error. seqtk failed\n")
        sys.exit(1)

    if verbose > 2:
        sys.stderr.write("done\n")
    return rev_path
def check_tool(seq_file, hmm_file, use_cmalign):
    """Check that the alignment tools are installed and work on the input.

    Three checks are done, each one reported on stderr:
      1. the aligner (cmalign or hmmalign) and esl-reformat are in the path;
      2. the aligner and esl-reformat run without error on the first three
         sequences of ``seq_file``;
      3. alignment quality: number of internal states, and for every aligned
         sequence the number of matches, deletions and insertions.

    Parameters
    ----------
     seq_file:    fasta file with the sequences to align [string]
     hmm_file:    file with the model passed to the aligner [string]
     use_cmalign: if True cmalign is used, otherwise hmmalign [bool]

    Returns
    -------
     True if a problem was found. Nothing (None) is returned when all checks
     pass. The process exits with code 1 if the temporary file cannot be
     saved.
    """
    ok_msg = f"{bcolors.OKGREEN}{bcolors.BOLD}{bcolors.UNDERLINE}correct{bcolors.ENDC}\n"
    error_prefix = f"\n{bcolors.FAIL}{bcolors.BOLD}{bcolors.UNDERLINE} ERROR:{bcolors.ENDC} "

    # check the aligner ---------------------------------------------------------
    if use_cmalign:
        sys.stderr.write("Check that 'cmalign' is in the path...................")
        if not is_tool("cmalign"):
            sys.stderr.write(error_prefix)
            sys.stderr.write("cmalign is not in the path. Please install Infernal.\n")
            return True
    else:
        sys.stderr.write("Check that 'hmmalign' is in the path..................")
        if not is_tool("hmmalign"):
            sys.stderr.write(error_prefix)
            sys.stderr.write("hmmalign is not in the path. Please install HMMER3.\n")
            return True
    # if we arrive here, then the tool is in the path:
    sys.stderr.write(ok_msg)

    # check esl-reformat --------------------------------------------------------
    sys.stderr.write("Check that 'esl-reformat' is in the path..............")
    if not is_tool("esl-reformat"):
        sys.stderr.write(error_prefix)
        sys.stderr.write("esl-reformat is not in the path. Please install Easel.\n")
        return True
    sys.stderr.write(ok_msg)

    # check that the file is correct --------------------------------------------
    # we create a temporary file with the first three fasta sequences:
    sys.stderr.write("Try to run alignment tool.............................")
    sys.stderr.flush()
    temp_file = tempfile.NamedTemporaryFile(delete=False, mode="w")
    try:
        os.chmod(temp_file.name, 0o644)
        with open(seq_file, "r") as seq_handle:
            n_headers = 0
            for line in seq_handle:
                if line.startswith(">"):
                    n_headers += 1
                    if n_headers > 3:
                        # nothing after the fourth header is needed
                        break
                temp_file.write(line)
        try:
            temp_file.flush()
            os.fsync(temp_file.fileno())
            temp_file.close()
        except OSError:
            # This function has no verbosity argument, so the error is always
            # reported.
            sys.stderr.write(error_prefix)
            sys.stderr.write("Error when saving the temp file\n")
            sys.exit(1)

        # Arguments are passed as lists so that paths containing whitespace
        # are not split into several arguments.
        aligner = "cmalign" if use_cmalign else "hmmalign"
        align_cmd = subprocess.Popen([aligner, hmm_file, temp_file.name],
                                     stdout=subprocess.PIPE)
        # parse the alignment
        parse_cmd = subprocess.Popen(["esl-reformat", "a2m", "-"],
                                     stdin=align_cmd.stdout,
                                     stdout=subprocess.PIPE)
        all_lines = list(linearise_fasta(parse_cmd.stdout, head_start=1))

        align_cmd.stdout.close()
        parse_cmd.stdout.close()
        # Wait for both processes, so that esl-reformat is also reaped when
        # the aligner is the one that failed.
        align_failed = align_cmd.wait()
        parse_failed = parse_cmd.wait()
        if align_failed or parse_failed:
            return True
    finally:
        # remove the temporary file on every exit path
        os.remove(temp_file.name)

    # if we arrive here, then the tools ran correctly:
    sys.stderr.write(ok_msg)

    # check alignment quality ---------------------------------------------------
    sys.stderr.write("\nCheck alignment quality:\n")
    if not all_lines:
        # nothing to evaluate; avoids an IndexError on all_lines[0]
        sys.stderr.write(error_prefix)
        sys.stderr.write("The alignment did not return any sequence.\n")
        return True

    # each line is "<id>\t<aligned sequence>"
    aligned = [entry.split("\t")[1] for entry in all_lines]

    # number of internal HMM states: gaps (deletions) and capital letters of
    # the first aligned sequence
    n_internal_states = sum(1 for char in aligned[0]
                            if char == "-" or char.isupper())
    sys.stderr.write(" Internal states: "+str(n_internal_states)+"\n")

    def percent(value):
        # guard against an alignment without internal states
        if not n_internal_states:
            return 0
        return round(value / n_internal_states * 100)

    for number, sequence in enumerate(aligned, start=1):
        sys.stderr.write("\n Sequence "+str(number)+":\n")
        # internal states that match (mismatches included) are upper case
        # letters, deletions are "-", insertions are lower case letters
        deletions = sequence.count("-")
        mat_i_s = sum(1 for char in sequence if char.isupper())
        insertions = sum(1 for char in sequence if char.islower())
        sys.stderr.write(" Internal states matches: "+str(mat_i_s)+" ("+str(percent(mat_i_s))+"%)\n")
        sys.stderr.write(" Deletions: "+str(deletions)+" ("+str(percent(deletions))+"%)\n")
        sys.stderr.write(" Insertions: "+str(insertions)+"\n")
def calc_al(fasta_file, hmm_file, use_cmalign, n_threads, verbose):
    """Align the sequences and compute, per sequence, the percentage of matches.

    The sequences are aligned with cmalign or hmmalign, the output is
    converted with ``esl-reformat a2m`` and, for every aligned sequence, the
    value ``matches / (matches + deletions) * 100`` is computed, where
    matches are upper case letters and deletions are "-" characters.

    Parameters
    ----------
     fasta_file:  file with the sequences to align [string]
     hmm_file:    file with the model passed to the aligner [string]
     use_cmalign: if True cmalign is used, otherwise hmmalign [bool]
     n_threads:   value passed to ``cmalign --cpu`` (not used with
                  hmmalign) [string/int]
     verbose:     the command is printed to stderr when > 4 [int]

    Returns
    -------
     Dictionary: sequence id -> percentage [float]. A sequence without any
     match or deletion column gets 0.0.

    The process is terminated with ``sys.exit(1)`` if a tool is missing or
    if one of the two commands fails.
    """
    # check that the tools are available
    if use_cmalign:
        if not is_tool("cmalign"):
            sys.stderr.write("[E::align] Error: cmalign is not in the path. Please install Infernal.\n")
            sys.exit(1)
    else:
        if not is_tool("hmmalign"):
            sys.stderr.write("[E::align] Error: hmmalign is not in the path. Please install HMMER3.\n")
            sys.exit(1)
    if not is_tool("esl-reformat"):
        sys.stderr.write("[E::align] Error: esl-reformat is not in the path. Please install Easel.\n")
        sys.exit(1)

    # Prepare the command as an argument list, so that paths containing
    # whitespace are not split into several arguments.
    if use_cmalign:
        align_argv = ["cmalign", "--cpu", str(n_threads)]
    else:
        align_argv = ["hmmalign"]
    align_argv += [hmm_file, fasta_file]
    if verbose > 4:
        sys.stderr.write("Command used to align the sequences: "+" ".join(align_argv)+"\n")

    # we call the command
    align_cmd = subprocess.Popen(align_argv, stdout=subprocess.PIPE)
    # parse the alignment
    parse_cmd = subprocess.Popen(["esl-reformat", "a2m", "-"],
                                 stdin=align_cmd.stdout,
                                 stdout=subprocess.PIPE)

    all_lines = dict()
    for line in linearise_fasta(parse_cmd.stdout, head_start=0):
        fields = line.split("\t")
        seq_id, aligned = fields[0], fields[1]
        # internal states that match (mismatches included) are upper case
        # letters, deletions are "-"; insertions (lower case letters) do not
        # enter the formula
        mat_i_s = sum(1 for char in aligned if char.isupper())
        deletions = aligned.count("-")
        covered = mat_i_s + deletions
        # guard against a sequence with no match/deletion column
        all_lines[seq_id] = (mat_i_s / covered) * 100 if covered else 0.0

    # check that hmmalign/cmalign finished correctly
    align_cmd.stdout.close()
    return_code = align_cmd.wait()
    if return_code:
        sys.stderr.write("[E::align] Error. hmmalign/cmalign failed\n")
        sys.exit(1)
    # check that converting the file worked correctly
    parse_cmd.stdout.close()
    return_code = parse_cmd.wait()
    if return_code:
        sys.stderr.write("[E::align] Error. esl-reformat failed\n")
        sys.exit(1)

    return all_lines
def align_generator(seq_file, protein_file, hmm_file, use_cmalign, n_threads, verbose, return_numpy, min_perc_state):
    """Align sequences and transform them into 1-hot encoding, ready for classification.

    Parameters
    ----------
     seq_file:       file with the nucleotide sequences [string]
     protein_file:   file with the protein sequences [string or None]
     hmm_file:       file with the hmm model [string]
     use_cmalign:    if True, we use cmalign. If false, we use hmmalign [bool]
     n_threads:      number of threads to use for cmalign (hmmalign can run only
                     on one thread) [string/int]
     verbose:        how much info to print [int]
     return_numpy:   True if you want to return a numpy array instead of a string
     min_perc_state: a sequence is yielded only if the percentage returned by
                     convert_alignment is at least this value

    Returns
    -------
     Returns a generator with:
     (fasta_id, aligned_sequence) tuples

    Raises
    ------
     ValueError: if a required tool is not in the path, or if the aligner or
                 esl-reformat exits with a non-zero code.

    When ``protein_file`` is given, the proteins are aligned and every protein
    alignment is mapped back to its gene with protein2gene_alignment; the
    process exits with code 1 if a protein id and a gene id differ.
    """
    # number of sequences that pass and don't pass the filter
    n_pass, n_not_pass = 0, 0

    # Check that the tools are available. Only the aligner that is actually
    # used is required: hmmalign must not be demanded when cmalign is used.
    if use_cmalign:
        if not is_tool("cmalign"):
            raise ValueError("[E::align] Error: cmalign is not in the path. Please install Infernal.")
    elif not is_tool("hmmalign"):
        raise ValueError("[E::align] Error: hmmalign is not in the path. Please install HMMER3.")
    if not is_tool("esl-reformat"):
        raise ValueError("[E::align] Error: esl-reformat is not in the path. Please install Easel.")

    # Arguments are kept as a list so that paths containing whitespace are
    # not split into several arguments.
    if use_cmalign:
        align_argv = ["cmalign", "--cpu", str(n_threads)]
    else:
        align_argv = ["hmmalign"]
    seq_input = protein_file if protein_file else seq_file
    align_argv += [hmm_file, seq_input]
    if verbose > 4:
        print(f"Command used to align the sequences: {' '.join(align_argv)}", file=sys.stderr)

    # run the command
    align_cmd = subprocess.Popen(align_argv, stdout=subprocess.PIPE)
    # command to parse the alignment from STOCKHOLM to fasta format
    parse_cmd = subprocess.Popen(["esl-reformat", "a2m", "-"],
                                 stdin=align_cmd.stdout,
                                 stdout=subprocess.PIPE)

    # The gene file is read in parallel only when proteins are aligned. The
    # handle is closed in the finally block, also when the consumer stops
    # iterating before the generator is exhausted.
    gene_handle = open(seq_file) if protein_file else None
    try:
        if protein_file:
            seq_stream = zip(read_fasta(parse_cmd.stdout, head_start=1),
                             read_fasta(gene_handle, is_binary=False, head_start=1))
        else:
            seq_stream = read_fasta(parse_cmd.stdout, head_start=1)

        for item in seq_stream:
            if protein_file:
                (pid, pseq), (gid, gseq) = item
                if pid != gid:
                    sys.stderr.write("[E::align] Error. protein and gene identifiers {} {} don't match.".format(pid, gid))
                    sys.exit(1)
                gseq = protein2gene_alignment(gid, pseq, gseq, check_length=True)
            else:
                gid, gseq = item

            converted_ali, perc_aligned_characters = convert_alignment(gseq, verbose, as_numpy=return_numpy)
            if perc_aligned_characters >= min_perc_state:
                n_pass += 1
                yield gid, converted_ali
            else:
                n_not_pass += 1

        # check that hmmalign/cmalign finished correctly
        align_cmd.stdout.close()
        return_code = align_cmd.wait()
        if return_code:
            raise ValueError("[E::align] Error. hmmalign/cmalign failed.")
        # check that converting the file worked correctly
        parse_cmd.stdout.close()
        return_code = parse_cmd.wait()
        if return_code:
            raise ValueError("[E::align] Error. esl-reformat failed.")
    finally:
        if gene_handle is not None:
            gene_handle.close()

    # print the number of sequences that were filtered
    if verbose > 3:
        print(f" Number of sequences that pass the filter: {n_pass}", file=sys.stderr)
        print(f" Number of sequences that do not pass the filter: {n_not_pass}", file=sys.stderr)
def __init__(self):
    """Refuse to build the object when prodigal cannot be found.

    Raises
    ------
     ValueError: if ``is_tool("prodigal")`` is false.
    """
    prodigal_available = is_tool("prodigal")
    if prodigal_available:
        return
    raise ValueError("[E::align] Error: prodigal is not in the path.\n")