def run_shorts(folder):
    """
    Relocate the known "problematic" genes and re-run codeml on them.

    Every entry of *folder* whose name appears in the hard-coded list below
    gets its own directory directly under the ella home directory. The gene's
    fasta file is moved there from the NoRef tree, converted to phylip, given
    fresh M8a / M8 control files and finally handed to
    ``candida_codeml_runner``.

    :param folder: directory whose entry names are matched against the
                   problematic gene names
    :return: None. The new fasta path is printed per gene and the number of
             processed genes is printed at the end.
    """
    home_dir = "/sternadi/home/volume2/ella/"
    noref_dir = "/sternadi/home/volume2/ella/Candida/Genes/NoRef/"
    tree_file = "/sternadi/home/volume2/ella/Candida/Trees/SNP_seq_without_ref.phy_phyml_tree.txt"
    problematic = [
        "Ca21chr1-orf19.4923.1",
        "Ca21chr3-orf19.6828.1",
        "Ca21chr5-orf19.961.2",
        "Ca21chrR-orf19.1738.1",
        "Ca21chrR-orf19.4380.1",
    ]

    handled = 0
    for gene in os.listdir(folder):
        if gene not in problematic:
            continue

        target_dir = home_dir + gene
        # Common prefix of every per-gene file: <home>/<gene>/<gene>
        stem = target_dir + "/" + gene
        new = stem + ".fasta"
        print(new)

        # Move the fasta out of the NoRef tree into its own directory.
        os.makedirs(target_dir)
        os.rename(noref_dir + gene + "/" + gene + ".fasta", new)

        seqFileTools.convert_fasta_to_phylip(new, outfile=None)

        phylip_file = stem + ".phy"
        M8a_CTL = stem + "-M8a.CLT"
        M8_CTL = stem + "-M8.CLT"
        # The last argument differs between the two control files:
        # 1 for the M8a file, 0 for the M8 file.
        PAML_utilities.candida_write_ctl_codeml_file(
            M8a_CTL, phylip_file, tree_file, stem + "-M8a-Results.txt", 1)
        PAML_utilities.candida_write_ctl_codeml_file(
            M8_CTL, phylip_file, tree_file, stem + "-M8-Results.txt", 0)

        candida_codeml_runner(M8a_CTL, M8_CTL, gene)
        handled += 1

    print("created shorts for: " + str(handled))
def create_files_for_genes(folder):
    """
    Prepare a PAML working folder for every fasta file listed in *folder*
    and start codeml on it.

    For each entry of *folder* ending in ``.fasta`` the function:

    1. opens the file of the same name in the fixed ``Candida/Genes``
       directory (NOTE: the content is read from that hard-coded directory,
       not from *folder* itself) and skips its first two lines, i.e. the
       reference gene;
    2. creates ``Candida/Genes/NoRef/<gene>/`` and writes a copy of the
       remaining records there, with ``A`` inserted after ``>`` in header
       lines and with the last three characters (the stop codon) removed
       from upper-cased sequence lines;
    3. converts that fasta file to phylip;
    4. writes the M8a and M8 codeml control files;
    5. calls ``candida_codeml_runner`` for the gene.

    :param folder: directory whose listing decides which genes are processed
    :return: None
    :raises OSError: if the source file cannot be opened or the gene folder
                     already exists (``os.makedirs`` is called without
                     ``exist_ok``)
    """
    genes_dir = "/sternadi/home/volume2/ella/Candida/Genes/"
    noref_dir = "/sternadi/home/volume2/ella/Candida/Genes/NoRef/"
    tree_file = "/sternadi/home/volume2/ella/Candida/Trees/SNP_seq_without_ref.phy_phyml_tree.txt"

    for filename in os.listdir(folder):
        if not filename.endswith(".fasta"):
            continue

        gene = filename[:-6]  # strip the ".fasta" extension
        gene_folder = noref_dir + gene
        fasta_path = gene_folder + "/" + filename

        # Context managers guarantee both handles are closed even when
        # makedirs or a write raises.
        with open(genes_dir + filename, "r") as source:
            # skip the reference gene (its header line and sequence line)
            source.readline()
            source.readline()

            os.makedirs(gene_folder)
            with open(fasta_path, "w") as output:
                # write all sequences with A at the beginning of the name,
                # and without the stop codon at the end
                for line in source:
                    if ">" in line:
                        output.write(line.replace(">", ">A"))
                    else:
                        # Strip the line terminator first so that a final
                        # line without a trailing newline loses exactly the
                        # three stop-codon characters, not four.
                        sequence = line.rstrip("\r\n")
                        output.write(sequence[:-3].upper() + "\n")

        # create phylip
        seqFileTools.convert_fasta_to_phylip(fasta_path, outfile=None)

        # create CLT
        prefix = gene_folder + "/"
        phylip_file = prefix + filename.replace(".fasta", ".phy")
        M8a_CTL = prefix + filename.replace(".fasta", "-M8a.CLT")
        M8_CTL = prefix + filename.replace(".fasta", "-M8.CLT")
        PAML_utilities.candida_write_ctl_codeml_file(
            M8a_CTL, phylip_file, tree_file,
            prefix + filename.replace(".fasta", "-M8a-Results.txt"), 1)
        PAML_utilities.candida_write_ctl_codeml_file(
            M8_CTL, phylip_file, tree_file,
            prefix + filename.replace(".fasta", "-M8-Results.txt"), 0)

        # run codeml
        candida_codeml_runner(M8a_CTL, M8_CTL, gene)
def main():
    """
    Command-line entry point: convert a fasta file to phylip.

    Options:
        -i / --input   fasta sequence file
        -o / --output  phylip output file (optional; when omitted, ``None``
                       is passed on to ``convert_fasta_to_phylip``)

    Prints the input path and the output path returned by the conversion.
    """
    parser = OptionParser("usage: %prog [options]")
    parser.add_option("-i", "--input", dest="input",
                      help="fasta sequence file")
    parser.add_option("-o", "--output", dest="output",
                      help="phylip output file", default=None)
    options, _ = parser.parse_args()

    fasta_path = check_filename(options.input)

    phylip_path = options.output
    if phylip_path is not None:
        # The checked path must be the one handed to the converter; it was
        # previously stored in a misspelled variable and silently dropped.
        # NOTE(review): Truefile=False presumably means the file does not
        # have to exist yet - confirm against check_filename.
        phylip_path = check_filename(phylip_path, Truefile=False)

    phylip_path = convert_fasta_to_phylip(fasta_path, phylip_path)
    print("converted %s fasta file into %s" % (fasta_path, phylip_path))
def phyml_aa_runner(alignment, alias="phyml", phylip=True):
    """
    run phyml on aa alignment on cluster

    The alignment path is validated with ``check_filename``; a fasta
    alignment is converted to phylip first. A PBS command file is then
    created for the PhyML command line and submitted.

    :param alignment: alignment file path
    :param alias: job name (default: phyml)
    :param phylip: True if phylip file, False if fasta file
    :return: job id
    """
    alignment = check_filename(alignment)
    if not phylip:
        # PhyML is given a phylip file, so convert fasta input first.
        alignment = convert_fasta_to_phylip(alignment)

    cmdfile = pbs_jobs.get_cmdfile_dir("phyml", alias)
    # NOTE(review): per the PhyML manual, -d aa selects amino-acid data,
    # -q sequential format and -b 0 disables bootstrap - confirm against
    # the installed version.
    cmds = "/sternadi/home/volume1/shared/tools/PhyML/PhyML_3.0_linux64 -i %s -d aa -q -b 0" % alignment
    pbs_jobs.create_pbs_cmd(cmdfile=cmdfile, alias=alias, jnum=1, gmem=2,
                            cmds=cmds)
    return pbs_jobs.submit(cmdfile)
def rerun_middle_stops(folder):
    """
    Re-create the PAML input of the genes listed in ``run_again`` and run
    PAML on them again.

    This function runs over the original gene folder and, for every gene in
    ``run_again``, deletes and re-creates its gene folder, rewrites the fasta
    file with each sequence passed through ``return_gapped_seq`` (intended to
    put gaps instead of stop codons in the middle) and then runs PAML again.
    After creating the new fasta file it confirms that the sequences differ;
    if ``all_the_same`` reports they do not, the gene is reported and PAML is
    not started for it.

    NOTE: the gene content is read from the fixed ``Candida/Genes``
    directory; *folder* only supplies the list of file names.

    :param folder: The general Genes folder
    :return: number of matched genes whose fasta file was rewritten (this
             includes genes that were then skipped because all their
             sequences were identical)
    :raises OSError: if a gene folder to delete does not exist or the source
                     file cannot be opened
    """
    genes_dir = "/sternadi/home/volume2/ella/Candida/Genes/"
    noref_dir = "/sternadi/home/volume2/ella/Candida/Genes/NoRef/"
    tree_file = "/sternadi/home/volume2/ella/Candida/Trees/SNP_seq_without_ref.phy_phyml_tree.txt"
    run_again = {
        "Ca21chr2-orf19.813",
        "Ca21chr6-RBF1",
        "Ca21chr4-PGA31",
        "Ca21chr5-orf19.1935",
        "Ca21chr4-GST1",
        "Ca21chr2-orf19.894",
        "Ca21chr7-orf19.5139",
        "Ca21chr1-CHS3",
        "Ca21chr4-orf19.2680",
        "Ca21chr3-orf19.6008",
        "Ca21chr1-orf19.4984",
        "Ca21chr4-RAM1",
        "Ca21chr2-orf19.1768",
        "Ca21chr5-orf19.4337",
        "Ca21chrR-orf19.1737",
        "Ca21chrR-orf19.6382",
        "Ca21chr1-orf19.7278",
        "Ca21chr1-orf19.6209",
        "Ca21chr3-FGR23",
        "Ca21chr4-ZCF27",
        "Ca21chr4-ERG26",
        "Ca21chr5-orf19.937",
    }

    cnt = 0
    for filename in os.listdir(folder):
        gene = filename[:-6]  # strip the ".fasta" extension
        if gene not in run_again:
            continue

        cnt += 1
        gene_folder = noref_dir + gene
        fasta_path = gene_folder + "/" + filename
        seqs = []

        # Start from an empty gene folder.
        shutil.rmtree(gene_folder)

        # Context managers guarantee both handles are closed even when
        # makedirs, return_gapped_seq or a write raises.
        with open(genes_dir + filename, "r") as source:
            # skip the reference gene (its header line and sequence line)
            source.readline()
            source.readline()

            os.makedirs(gene_folder)
            with open(fasta_path, "w") as output:
                # write all sequences with A at the beginning of the name,
                # and without the stop codon at the end; also omits the stop
                # codons from the middle and completes them with gaps.
                for line in source:
                    if ">" in line:
                        output.write(line.replace(">", ">A"))
                    else:
                        # Strip the line terminator first so that a final
                        # line without a trailing newline loses exactly the
                        # three stop-codon characters, not four.
                        trimmed = line.rstrip("\r\n")[:-3].upper()
                        new_line = return_gapped_seq(trimmed)
                        output.write(new_line + "\n")
                        seqs.append(new_line)

        # confirm all the sequences are not the same:
        if all_the_same(seqs):
            print("all the same in " + filename)
            continue

        # create phylip
        seqFileTools.convert_fasta_to_phylip(fasta_path, outfile=None)

        # create CLT
        prefix = gene_folder + "/"
        phylip_file = prefix + filename.replace(".fasta", ".phy")
        M8a_CTL = prefix + filename.replace(".fasta", "-M8a.CLT")
        M8_CTL = prefix + filename.replace(".fasta", "-M8.CLT")
        PAML_utilities.candida_write_ctl_codeml_file(
            M8a_CTL, phylip_file, tree_file,
            prefix + filename.replace(".fasta", "-M8a-Results.txt"), 1)
        PAML_utilities.candida_write_ctl_codeml_file(
            M8_CTL, phylip_file, tree_file,
            prefix + filename.replace(".fasta", "-M8-Results.txt"), 0)

        # run codeml
        run_PAML.candida_codeml_runner(M8a_CTL, M8_CTL, gene)

    print(str(cnt) + " files were ran again")
    # The docstring always promised this value; callers that ignored the
    # previous implicit None are unaffected.
    return cnt