def paraAT(in_dir):
    '''
    Launch ParaAT.pl for every entry of in_dir / "nucleotide" and wait for
    all runs to finish.

    input 1: in_dir, a pathlib.Path containing "nucleotide", "homolog" and
             "aminoacid"; the three inputs of one run are matched by the stem
             of the nucleotide entry
    output 1: in_dir / "ParaAT_out" / <stem>        (passed to ParaAT as "-o")
    output 2: in_dir / "ParaAT_stdout" / <stem>.txt
    output 3: in_dir / "ParaAT_stderr" / <stem>.txt

    Every process is started before any of them is waited on; exit codes are
    not inspected.
    '''
    import os

    nucleotide_dir_path = in_dir / "nucleotide"
    homolog_dir_path = in_dir / "homolog"
    protein_dir_path = in_dir / "aminoacid"
    out_dir_path = directory_creater(in_dir / "ParaAT_out")
    ParaAT_stdout = directory_creater(in_dir / "ParaAT_stdout")
    ParaAT_stderr = directory_creater(in_dir / "ParaAT_stderr")

    proc_file_names = []
    procs_list = []
    try:
        cmds_list = []
        for gene_id in nucleotide_dir_path.iterdir():
            # The file handed to "-p" must be closed (and therefore flushed)
            # before ParaAT.pl starts, otherwise the child may read an empty
            # file. delete=False keeps it on disk after the close.
            with NamedTemporaryFile('w+t', delete=False) as f:
                f.write('6')
            proc_file_names.append(f.name)
            cmds_list.append(([
                "/mnt/d/zhes_learning_space/software_in_ubuntu/ParaAT2.0/ParaAT.pl",
                "-h", str(homolog_dir_path / (gene_id.stem + ".txt")),
                "-n", str(nucleotide_dir_path / (gene_id.stem + ".fasta")),
                "-a", str(protein_dir_path / (gene_id.stem + ".fasta")),
                "-p", f.name,
                "-m", "muscle",
                "-f", "axt",
                "-g",
                "-k",
                "-o", str(out_dir_path / gene_id.stem)
            ], gene_id.stem))

        for cmd in cmds_list:
            # The child inherits its own copies of the descriptors, so the
            # parent's handles can be closed as soon as Popen returns.
            with (ParaAT_stdout / (cmd[1] + ".txt")).open('w') as out_fl, \
                    (ParaAT_stderr / (cmd[1] + ".txt")).open('w') as err_fl:
                procs_list.append(
                    subprocess.Popen(cmd[0], stdout=out_fl, stderr=err_fl))
    finally:
        for proc in procs_list:
            proc.wait()
        # Only remove the "-p" files once no child can still be reading them.
        for proc_file_name in proc_file_names:
            os.remove(proc_file_name)
def extract_ortholog_gene(ortholog_blast_path_name):
    '''
    Build one delayed ortholog_blast task per file of
    <out>/all_row_gene/fasta and run them with dask.compute.

    input 1: ortholog_blast_path_name, root directory of the ortholog blast
             run (must already exist; only its sub-directories are created)
    output 1: <out>/blast_general/{blast_db,blast_out/...,blast_identity_value}
    output 2: <out>/no_MGG.txt, opened for writing and handed to every task

    The generate_row_list / extract_gene_gff stages are disabled in this
    version, so <out>/all_row_gene/fasta is only read here, never filled.
    '''
    general_out_path = Path(ortholog_blast_path_name)
    blast_general = general_out_path / "blast_general"
    if not blast_general.exists():
        blast_general.mkdir()

    all_row_gene_dir_path = directory_creater(general_out_path / "all_row_gene")
    all_row_gene_fasta_dir = directory_creater(all_row_gene_dir_path / "fasta")
    # Created for their side effect only; nothing below reads these two.
    directory_creater(all_row_gene_dir_path / "list")
    directory_creater(all_row_gene_dir_path / "all_list")

    blast_db_path = blast_general / "blast_db"
    blast_out_path = blast_general / "blast_out"
    blast_out_xml_path = blast_out_path / "blast_out_xml"
    blast_out_asn_path = blast_out_path / "blast_out_asn"
    blast_out_txt_path = blast_out_path / "blast_out_txt"
    blast_identity_value_dir = blast_general / "blast_identity_value"
    # Parents come before children: mkdir() does not create missing parents.
    for dir_path in (blast_db_path, blast_out_path, blast_out_xml_path,
                     blast_out_asn_path, blast_out_txt_path,
                     blast_identity_value_dir):
        if not dir_path.exists():
            dir_path.mkdir()

    base = importr("base")
    utils = importr("utils")

    # Keep no_MGG.txt open for the whole computation and close it afterwards;
    # previously the handle was never closed.
    with (general_out_path / "no_MGG.txt").open('w') as no_mgg_fl:
        calls = [
            delayed(ortholog_blast)(
                all_row_gene_fasta_file,
                blast_identity_value_dir,
                no_mgg_fl,
                utils.read_table,
                utils.write_table,
                blast_db_path,
                blast_out_xml_path,
                blast_out_asn_path,
                blast_out_txt_path,
                base.list,
                base.do_call)
            for all_row_gene_fasta_file in all_row_gene_fasta_dir.iterdir()
        ]
        dask.compute(*calls)
def blast(species_path, query_file_1, species_out_path_name, pav_excel_name):
    '''
    Run run_blast on the contig file(s) matched in species_path, then grade
    every query of the resulting XML report and save the grades to an Excel
    workbook.

    input 1: species_path, directory searched with glob("1106.2.fasta")
    input 2: query_file_1, blastn query file (pan genes); published through
             the module-level global query_file
    input 3: species_out_path_name, intermediate output root, e.g.
             Path('../Pan_genome_data/c_blast_present_contig/')
    output 1: pav_excel_name, path the workbook is saved to

    Grades passed to out_to_excel (taken from the first HSP encountered):
        0  no alignment, or align_length < 100 and coverage < 0.5
        4  coverage == 1 and identity == 1
        3  coverage == 1 and no gaps
        2  alignment longer than the query, or at most 50 shorter
        1  anything else
    '''
    # These globals are assigned here; presumably run_blast, out_to_excel and
    # excel_species_name read them - verify against their definitions.
    global pan_sh
    global query_file
    query_file = query_file_1
    species_out_path = Path(species_out_path_name)
    excel_book = Workbook()
    pan_sh = excel_book.active
    species_count = 1
    global species_db_dir, species_out_xml_dir, species_out_asn_dir, species_out_txt_dir
    species_db_dir = directory_creater(species_out_path / "blastdb")
    species_out_xml_dir = directory_creater(species_out_path / "xml")
    species_out_asn_dir = directory_creater(species_out_path / "asn")
    species_out_txt_dir = directory_creater(species_out_path / "txt")
    # The glob pattern has no wildcard, so it matches at most the single file
    # "1106.2.fasta".
    Parallel(n_jobs=1)(delayed(run_blast)(i)
                       for i in species_path.glob("1106.2.fasta"))
    for species_file in species_path.glob("1106.2.fasta"):
        # print (str(species_file)+"\n")
        species_count = species_count + 1
        species_name = species_file.stem
        # NOTE(review): the header call uses species_count + 1 while the
        # grades below use species_count - confirm the two helpers index the
        # sheet differently on purpose.
        excel_species_name(species_count + 1, species_name)
        with open(species_out_xml_dir / (species_name + ".xml")) as fl:
            for record in NCBIXML.parse(fl):
                # Only the first whitespace-separated token of the query
                # title is used as the gene name.
                gene_name = record.query.split()[0]
                if record.alignments:
                    # max_flag is a "first HSP not seen yet" marker: it leaves
                    # -1 after the first HSP, so later HSPs are ignored.
                    max_flag = -1
                    #out_to_excel(species_count,record.query,1)
                    for alignment in record.alignments:
                        for hsp in alignment.hsps:
                            if max_flag == -1:
                                # Fraction of the query covered by the HSP.
                                identity_discriminant_for_length = hsp.align_length / record.query_length
                                # Fraction of identical positions in the HSP.
                                identity_discriminant_for_identity_perscent = hsp.identities / hsp.align_length
                                max_flag = max_flag + 2
                                if hsp.align_length < 100 and identity_discriminant_for_length < 0.5:
                                    out_to_excel(species_count, gene_name, 0)
                                elif identity_discriminant_for_length == 1 and identity_discriminant_for_identity_perscent == 1:
                                    out_to_excel(species_count, gene_name, 4)
                                elif identity_discriminant_for_length == 1 and hsp.gaps == 0:
                                    out_to_excel(species_count, gene_name, 3)
                                elif hsp.align_length > record.query_length or record.query_length - hsp.align_length <= 50:
                                    out_to_excel(species_count, gene_name, 2)
                                else:
                                    out_to_excel(species_count, gene_name, 1)
                else:
                    # No hit at all for this query.
                    out_to_excel(species_count, gene_name, 0)
    excel_book.save(pav_excel_name)
def prepare_for_ParaAT(joined_df_file_name,coding_gene_base_file_path,protein_base_file_path,out_dir):
    '''
    Write the per-gene nucleotide, amino-acid and homolog files under out_dir.

    input 1: joined_df_file_name, tab-separated table with a header row
    input 2: coding_gene_base_file_path, fasta indexed into coding_gene_base
    input 3: protein_base_file_path, fasta indexed into protein_base
    output 1: out_dir, receives "nucleotide", "aminoacid",
              "gene_protein_base" and "homolog"

    Rows whose head cell holds exactly one id are written here (and skipped
    when the nucleotide fasta already exists); all other rows go to two_head.
    '''
    global coding_gene_base,protein_base,gene_fasta_fl,protein_fasta_fl

    nucleotide_out=directory_creater(out_dir/"nucleotide")
    aminoacid_out=directory_creater(out_dir/"aminoacid")
    directory_creater(out_dir/"gene_protein_base")
    homolog_out=directory_creater(out_dir/"homolog")

    coding_gene_base=SeqIO.index(str(coding_gene_base_file_path),"fasta")
    protein_base=SeqIO.index(str(protein_base_file_path),"fasta")

    base=importr("base")
    utils=importr("utils")
    read_options={'stringsAsFactors': False,'check.names': False}
    joined_df=utils.read_table(joined_df_file_name,sep="\t",header=True,**read_options)
    # Same table without its first column.
    joined_df_sub=joined_df.rx(True,-1)
    row_total=int(base.nrow(joined_df)[0])

    # R rows are 1-based.
    for row_number in range(1,row_total+1):
        cells=iter(joined_df_sub.rx(row_number,True))
        head_ids=next(cells)[0].split()

        if len(head_ids)!=1:
            two_head(head_ids,cells)
            continue

        head_id=head_ids[0]
        gene_fasta=nucleotide_out/(head_id+".fasta")
        if gene_fasta.is_file():
            # Already produced by an earlier run.
            continue
        protein_fasta=aminoacid_out/(head_id+".fasta")
        homolog_file=homolog_out/(head_id+".txt")

        # gene_fasta_fl and protein_fasta_fl are module globals, bound here.
        with gene_fasta.open('w') as gene_fasta_fl, \
                protein_fasta.open('w') as protein_fasta_fl, \
                homolog_file.open('w') as homolog_fl:
            extract_gene(head_id)
            homolog_fl.write(head_id+"\t")
            for homolog_id in one_head(cells):
                homolog_fl.write(homolog_id+"\t")
            homolog_fl.write("\n")
def format_paraAT_parameter(in_dir):
    '''
    Yield one (ParaAT.pl command list, gene stem) pair per entry of
    in_dir / "nucleotide".

    input 1: in_dir, a pathlib.Path containing "nucleotide", "homolog" and
             "aminoacid"
    output 1: in_dir / "ParaAT_out" / <stem> is passed to ParaAT as "-o"

    This is a generator: nothing (including the creation of "ParaAT_out")
    happens until it is iterated. Each command gets its own "-p" file, which
    is left on disk for the caller's subprocess to read.
    '''
    nucleotide_dir_path=in_dir/"nucleotide"
    homolog_dir_path=in_dir/"homolog"
    protein_dir_path=in_dir/"aminoacid"
    out_dir_path=directory_creater(in_dir/"ParaAT_out")
    for gene_id in nucleotide_dir_path.iterdir():
        # Close the file before yielding so the '1' is flushed to disk;
        # previously the handle stayed open and the content could still be
        # sitting in the write buffer when ParaAT.pl read the file.
        with NamedTemporaryFile('w+t',delete=False) as f:
            f.write('1')
        yield ([
            "/mnt/d/zhes_learning_space/software_in_ubuntu/ParaAT2.0/ParaAT.pl",
            "-h",
            str(homolog_dir_path/(gene_id.stem+".txt")),
            "-n",
            str(nucleotide_dir_path/(gene_id.stem+".fasta")),
            "-a",
            str(protein_dir_path/(gene_id.stem+".fasta")),
            "-p",
            f.name,
            "-m",
            "muscle",
            "-f",
            "axt",
            # "-g",
            "-k",
            "-o",
            str(out_dir_path/gene_id.stem)
            ],
            gene_id.stem
        )
def extract_ortholog_gene(strain_db_dir_path,contig_path,joined_df_file_name,ortholog_blast_path_name):
    '''
    Run generate_row_list for every row of the joined table, then
    extract_gene_gff for every file of all_row_gene/all_list (12 jobs each).

    input 1: strain_db_dir_path
    input 2: contig_path
    input 3: joined_df_file_name, tab-separated table with a header row
    output 1: ortholog_blast_path_name, output root (must already exist)
    '''
    out_root=Path(ortholog_blast_path_name)

    blast_general=out_root/"blast_general"
    if not blast_general.exists():
        blast_general.mkdir()

    row_gene_root=directory_creater(out_root/"all_row_gene")
    row_gene_fasta_dir=directory_creater(row_gene_root/"fasta")
    row_gene_list_dir=directory_creater(row_gene_root/"list")
    row_gene_all_list_dir=directory_creater(row_gene_root/"all_list")

    # Parents are listed before their children; mkdir() is not recursive.
    blast_out=blast_general/"blast_out"
    for sub_dir in (
            blast_general/"blast_db",
            blast_out,
            blast_out/"blast_out_xml",
            blast_out/"blast_out_asn",
            blast_out/"blast_out_txt",
            blast_general/"blast_identity_value"):
        if not sub_dir.exists():
            sub_dir.mkdir()

    base=importr("base")
    utils=importr("utils")
    joined_df=utils.read_table(
        str(joined_df_file_name),
        sep="\t",
        header=True,
        **{'stringsAsFactors': False,'check.names': False}
        )
    _=base.list()
    row_total=int(base.nrow(joined_df)[0])

    # Stage 1: one task per table row (R rows are 1-based).
    row_tasks=(
        delayed(generate_row_list)(
            joined_df.rx(row_number,True),
            row_gene_list_dir,
            row_gene_all_list_dir)
        for row_number in range(1,row_total+1)
        )
    Parallel(n_jobs=12)(row_tasks)

    # Stage 2: one task per list file written under all_list.
    gff_tasks=(
        delayed(extract_gene_gff)(
            list_file,
            strain_db_dir_path,
            contig_path,
            row_gene_fasta_dir)
        for list_file in row_gene_all_list_dir.iterdir()
        )
    Parallel(n_jobs=12)(gff_tasks)
def calculate_nucleotide_diversity(gene_category_dir_name, output_dir):
    '''
    For every gene-category file, write a BED file (unless it already exists)
    and call vcftool on it.

    input 1: gene_category_dir_name, directory with one file per category
    output 1: output_dir (pathlib.Path), receives "category_bed",
              "pi_result" and "pi_std_out_err"

    Side effect: the module-level global MGG_db is bound to the gffutils
    database at ../Pan_genome_data/ortholog/gffutils_db/MGGdb.db.
    '''
    global MGG_db
    MGG_db = gffutils.FeatureDB(
        "../Pan_genome_data/ortholog/gffutils_db/MGGdb.db")
    gene_category_dir_path = Path(gene_category_dir_name)
    category_bed_dir_path = directory_creater(output_dir / "category_bed")
    pi_result_dir_path = directory_creater(output_dir / "pi_result")
    pi_std_out_err_dir_path = directory_creater(output_dir / "pi_std_out_err")
    for category_file in gene_category_dir_path.iterdir():
        bed_file_path = category_bed_dir_path / (category_file.stem + ".bed")
        if not bed_file_path.is_file():
            # saveas() stays inside the with-block so the intervals are fully
            # consumed before the category file is closed (the handle used to
            # be left open).
            with category_file.open() as category_fl:
                pybedtools.BedTool(
                    generate_interval(category_fl)).saveas(bed_file_path)
        vcftool(
            bed_file_path,
            pi_result_dir_path / category_file.stem,
            pi_std_out_err_dir_path / (category_file.stem + "_out_err.txt"))
def CombineGVCFs(snp_dir):
    '''
    Run GATK CombineGVCFs over every *.vcf file of snp_dir / "gvcf".

    input 1: snp_dir (pathlib.Path) generated by call snp
    output 1: snp_dir / "CombineGVCFs_vcf" / "pan_genome_CombineGVCFs.vcf"
    output 2: stdout/stderr of the run under
              snp_dir / "intermediate_files" / "CombineGVCFs_std_out_err"

    The GATK exit code is not inspected.
    '''
    # gvcf_dir_path was never assigned in this function; use the same
    # location GenomicsDBImport reads its per-strain gVCF files from.
    gvcf_dir_path = snp_dir / "gvcf"
    vcfdb_dir = directory_creater(snp_dir / "CombineGVCFs_vcf")
    std_out_err = directory_creater(snp_dir / "intermediate_files" /
                                    "CombineGVCFs_std_out_err")
    call_list = [
        "/mnt/d/zhes_learning_space/software_in_ubuntu/gatk-4.1.8.0/gatk",
        "--java-options", "-Xmx20g -Xms20g",
        "CombineGVCFs",
        "-R",
        "/mnt/d/zhes_learning_space/the_assignment/pan_genome/Mag_genomes/70-15_refference_genome/70-15_supercontigs.fasta",
        "-O", str(vcfdb_dir / "pan_genome_CombineGVCFs.vcf")
    ]
    # One "-V <file>" pair per input gVCF.
    for vcf_file in gvcf_dir_path.glob("*.vcf"):
        call_list.extend(("-V", str(vcf_file)))
    # Close both log files once GATK has finished (they used to leak).
    with (std_out_err / "CombineGVCFs_stdout.txt").open('w') as stdout_fl, \
            (std_out_err / "CombineGVCFs_stderr.txt").open('w') as stderr_fl:
        call_CombineGVCFs = subprocess.Popen(
            call_list, stdout=stdout_fl, stderr=stderr_fl)
        call_CombineGVCFs.wait()
def GenotypeGVCFs(snp_dir):
    '''
    Run GATK GenotypeGVCFs on the combined gVCF of snp_dir.

    input 1: snp_dir (pathlib.Path); reads
             snp_dir / "CombineGVCFs_vcf" / "pan_genome_CombineGVCFs.vcf"
    output 1: snp_dir / "call_snp_gatk.vcf"
    output 2: stdout/stderr of the run under
              snp_dir / "intermediate_files" / "GenotypeGVCFs_out_err"

    The GATK exit code is not inspected.
    '''
    input_vcf_file = snp_dir / "CombineGVCFs_vcf" / "pan_genome_CombineGVCFs.vcf"
    output_vcf_file = snp_dir / "call_snp_gatk.vcf"
    std_out_err_dir = directory_creater(snp_dir / "intermediate_files" /
                                        "GenotypeGVCFs_out_err")
    call_list = [
        "/mnt/d/zhes_learning_space/software_in_ubuntu/gatk-4.1.8.0/gatk",
        "--java-options", "-Xmx20g -Xms20g",
        "GenotypeGVCFs",
        "-R",
        "/mnt/d/zhes_learning_space/the_assignment/pan_genome/Mag_genomes/70-15_refference_genome/70-15_supercontigs.fasta",
        "-V", str(input_vcf_file),
        "-O", str(output_vcf_file)
    ]
    # Close both log files once GATK has finished (they used to leak).
    with (std_out_err_dir / "GenotypeGVCFs_out.txt").open('w') as stdout_fl, \
            (std_out_err_dir / "GenotypeGVCFs_err.txt").open('w') as stderr_fl:
        call_GenotypeGVCFs = subprocess.Popen(
            call_list, stdout=stdout_fl, stderr=stderr_fl)
        call_GenotypeGVCFs.wait()
def GenomicsDBImport(snp_dir):
    '''
    Run GATK GenomicsDBImport over every *.vcf file of snp_dir / "gvcf".

    input 1: snp_dir (pathlib.Path) generated by call snp
    output 1: snp_dir / "vcfdb", passed as --genomicsdb-workspace-path
    output 2: stdout/stderr of the run under
              snp_dir / "intermediate_files" / "GenomicsDBImport_std_out_err"

    The GATK exit code is not inspected.
    '''
    gvcf_dir_path = snp_dir / "gvcf"
    vcfdb_dir = snp_dir / "vcfdb"
    std_out_err = directory_creater(snp_dir / "intermediate_files" /
                                    "GenomicsDBImport_std_out_err")
    call_list = [
        "/mnt/d/zhes_learning_space/software_in_ubuntu/gatk-4.1.8.0/gatk",
        "--java-options", "-Xmx20g -Xms20g",
        "GenomicsDBImport",
        "--genomicsdb-workspace-path", str(vcfdb_dir)
    ]
    # One "-V <file>" pair per input gVCF.
    for vcf_file in gvcf_dir_path.glob("*.vcf"):
        call_list.extend(("-V", str(vcf_file)))
    # Close both log files once GATK has finished (they used to leak).
    with (std_out_err / "GenomicsDBImport_stdout.txt").open('w') as stdout_fl, \
            (std_out_err / "GenomicsDBImport_stderr.txt").open('w') as stderr_fl:
        call_GenomicsDBImport = subprocess.Popen(
            call_list, stdout=stdout_fl, stderr=stderr_fl)
        call_GenomicsDBImport.wait()
def pav_contig_present_blast_gth_main (species_path,query_file_1,species_out_path_name,pav_excel_name):
    '''
    Prepare the blast/gth output tree and hand the XML directory to excel().

    input 1: species_path, contig directory (only referenced by the disabled
             stages below)
    input 2: query_file_1, blastn query file (pan genes), published as the
             global query_file
    input 3: species_out_path_name, intermediate output root, e.g.
             Path('../Pan_genome_data/c_blast_present_contig/')
    output 1: pav_excel_name, forwarded to excel()
    '''
    global query_file,species_db_dir,species_out_xml_dir,species_out_asn_dir,species_out_txt_dir

    query_file=query_file_1
    out_root=Path(species_out_path_name)

    species_db_dir=directory_creater(out_root/"blastdb")
    species_out_xml_dir=directory_creater(out_root/"xml")
    species_out_asn_dir=directory_creater(out_root/"asn")
    species_out_txt_dir=directory_creater(out_root/"txt")
    # Only needed by the disabled gth / gff-db stages, but still created.
    for sub_dir_name in ("gth_out","gth_stdout","gff_db"):
        directory_creater(out_root/sub_dir_name)

    # Disabled stages, kept for reference:
    # Parallel(n_jobs=12)(delayed(run_blast)(query_file,i,species_out_path) for i in species_path.glob("*.fasta"))
    # Parallel(n_jobs=12)(delayed(run_gth)(query_file,i,species_out_path) for i in species_path.glob("*.fasta"))
    # Parallel(n_jobs=12)(delayed(update_gff_db)(i,gff_db_dir_path) for i in species_out_gth_dir.glob("*.gff"))
    excel(species_out_xml_dir,out_root,pav_excel_name)
def extract_ortholog_gene(strain_db_dir_path,contig_path,joined_df_file_name,ortholog_blast_path_name):
    '''
    Create the ortholog-blast output tree and run extract_gene (12 jobs) for
    every file of <out>/best_sequence_id_list.

    input 1: strain_db_dir_path   (not used by the stage that is enabled)
    input 2: contig_path          (not used by the stage that is enabled)
    input 3: joined_df_file_name  (not used by the stage that is enabled)
    output 1: ortholog_blast_path_name, output root (must already exist);
              the single-copy fasta files go to
              <out>/../MAFFT_ortholog_MGG/in_put_fasta

    The earlier stages of this function (generate_row_list,
    extract_gene_gff, ortholog_blast with merge_df, parse_blast_result) were
    present only as commented-out code and never ran, so their outputs must
    already be on disk.
    '''
    out_root=Path(ortholog_blast_path_name)

    blast_general=out_root/"blast_general"
    if not blast_general.exists():
        blast_general.mkdir()

    row_gene_root=directory_creater(out_root/"all_row_gene")
    row_gene_fasta_dir=directory_creater(row_gene_root/"fasta")
    directory_creater(row_gene_root/"list")
    directory_creater(row_gene_root/"all_list")

    # Parents are listed before their children; mkdir() is not recursive.
    blast_out=blast_general/"blast_out"
    for sub_dir in (
            blast_general/"blast_db",
            blast_out,
            blast_out/"blast_out_xml",
            blast_out/"blast_out_asn",
            blast_out/"blast_out_txt",
            blast_general/"blast_identity_value"):
        if not sub_dir.exists():
            sub_dir.mkdir()

    directory_creater(out_root/"lower_blast_value")
    best_id_list_dir=directory_creater(out_root/"best_sequence_id_list")
    single_copy_fasta_dir=directory_creater(out_root.parent/"MAFFT_ortholog_MGG"/"in_put_fasta")

    # Still loaded although the enabled stage does not touch them.
    importr("base")
    importr("utils")

    # extract_gene stage: id list, source fasta and destination fasta all
    # share the stem of the id-list file.
    tasks=(
        delayed(extract_gene)(
            id_list_file,
            row_gene_fasta_dir/(id_list_file.stem+".fasta"),
            single_copy_fasta_dir/(id_list_file.stem+".fasta"))
        for id_list_file in best_id_list_dir.iterdir()
        )
    Parallel(n_jobs=12)(tasks)
def call_snp_HaplotypeCaller(id_file, contig_dir_name, out_dir):
    '''
    Drive the per-strain chain mummer_call -> replace_header -> sam2bam ->
    add_RG -> bam_sort / bam_index -> haplotypecaller.

    input 1: id_file, strain 95 file read through extract_strain_id
    input 2: contig_dir_name, published as the global contig_dir_path
    output 1: out_dir (pathlib.Path); final per-strain files are written to
              out_dir / "gvcf" / <strain>.vcf, intermediates and logs to the
              other sub-directories created below

    Each stage is skipped when its output file already exists, so an
    interrupted run can be restarted.
    '''
    # Assigned here as module globals; presumably read by mummer_call -
    # verify against its definition.
    global mummer_stderr_dir, mummer_stdout_dir, contig_dir_path, sam_out_dir
    sam_out_dir = directory_creater(out_dir / "mummer_sam_files")
    raw_bam_out_dir = directory_creater(out_dir / "raw_bam_files")
    RG_bam_dir = directory_creater(out_dir / "RG_bam_files")
    intermediate_dir = directory_creater(out_dir / "intermediate_files")
    mummer_stderr_dir = directory_creater(intermediate_dir / "mummer_err")
    mummer_stdout_dir = directory_creater(intermediate_dir / "mummer_out")
    samview_stderr_dir = directory_creater(intermediate_dir / "sam_view_err")
    add_RG_stdout_dir = directory_creater(intermediate_dir / "add_RG_out")
    add_RG_stderr_dir = directory_creater(intermediate_dir / "add_RG_err")
    strain_95_list = extract_strain_id(id_file)
    # Adjust the id list: add "ina168", drop the 70-15 entry. remove() raises
    # ValueError when that entry is absent.
    strain_95_list.append("ina168")
    strain_95_list.remove("magnaporthe_oryzae_70-15_8_proteins_T0")
    contig_dir_path = Path(contig_dir_name)
    replace_header_sam_dir = directory_creater(intermediate_dir /
                                               "replace_header_sam")
    bam_sort_dir_path = directory_creater(intermediate_dir / "bam_sort")
    bam_sort_stdout_dir_path = directory_creater(intermediate_dir /
                                                 "bam_sort_out")
    bam_sort_stderr_dir_path = directory_creater(intermediate_dir /
                                                 "bam_sort_err")
    bam_index_stdout_dir_path = directory_creater(intermediate_dir /
                                                  "bam_index_out")
    bam_index_stderr_dir_path = directory_creater(intermediate_dir /
                                                  "bam_index_err")
    gvcf_dir = directory_creater(out_dir / "gvcf")
    haplotypecaller_stdout_dir_path = directory_creater(
        intermediate_dir / "haplotypecaller_stdout")
    haplotypecaller_stderr_dir_path = directory_creater(
        intermediate_dir / "haplotypecaller_stderr")
    for strain_95_id in strain_95_list:
        # Stage 1: SAM produced by mummer_call.
        mummer_sam_out = sam_out_dir / (strain_95_id + ".sam")
        if mummer_sam_out.is_file() is False:
            mummer_call(strain_95_id, str(mummer_sam_out))
        # Stage 2: SAM with replaced header.
        replace_header_sam_file_path = replace_header_sam_dir / (strain_95_id +
                                                                 ".sam")
        if replace_header_sam_file_path.is_file() is False:
            replace_header(mummer_sam_out, replace_header_sam_file_path)
        # Stage 3: SAM -> BAM.
        raw_bam_file_path = raw_bam_out_dir / (strain_95_id + ".bam")
        if raw_bam_file_path.is_file() is False:
            sam2bam(replace_header_sam_file_path, raw_bam_file_path,
                    samview_stderr_dir / (strain_95_id + "_samview_err.txt"))
        # Stage 4: BAM written by add_RG.
        RG_bam_file_path = RG_bam_dir / (strain_95_id + ".bam")
        if RG_bam_file_path.is_file() is False:
            add_RG(raw_bam_file_path, RG_bam_file_path,
                   add_RG_stdout_dir / (strain_95_id + "_out.txt"),
                   add_RG_stderr_dir / (strain_95_id + "_err.txt"))
        # Stage 5: sort, then index the sorted BAM.
        # NOTE(review): bam_index is kept under the same guard as bam_sort
        # (it has no output path of its own to test) - confirm this matches
        # the intended layout.
        bam_RG_sort_file_path = bam_sort_dir_path / (strain_95_id +
                                                     "_sort.bam")
        if bam_RG_sort_file_path.is_file() is False:
            bam_sort(RG_bam_file_path, bam_RG_sort_file_path,
                     bam_sort_stdout_dir_path / (strain_95_id + "_out.txt"),
                     bam_sort_stderr_dir_path / (strain_95_id + "_err.txt"))
            bam_index(bam_RG_sort_file_path,
                      bam_index_stdout_dir_path / (strain_95_id + "_out.txt"),
                      bam_index_stderr_dir_path / (strain_95_id + "_err.txt"))
        # Stage 6: per-strain output of haplotypecaller.
        gvcf_file_path = gvcf_dir / (strain_95_id + ".vcf")
        if gvcf_file_path.is_file() is False:
            haplotypecaller(
                bam_RG_sort_file_path, gvcf_file_path,
                haplotypecaller_stdout_dir_path / (strain_95_id + "_out.txt"),
                haplotypecaller_stderr_dir_path / (strain_95_id + "_err.txt"))
# draw_stack() # set_minus(str(R_result)+"/") # drawer_curve(str(R_result)+"/") pav_with_cluster_out_file_name = "../Pan_genome_data/pav_with_cluster.tsv" clade_set = ["1", "2", "3", "4"] color_set = "Set2" cluster_clade_file_name = "../Pan_genome_data/R_result/strain_clade_category_ID.txt" # cluster(pav_excel,clade_set,color_set,pav_with_cluster_out_file_name,cluster_clade_file_name,str(R_result)+"/") # write_cluster_result() # set_minus_with_cluster(str(R_result)+"/",cluster_clade_file_name,clade_set,color_set) orthofinder_assianed_tsv_file_name = "../Pan_genome_data/Results_May22_1/Orthogroups/Orthogroups.tsv" orthofinder_unassianed_tsv_file_name = "../Pan_genome_data/Results_May22_1/Orthogroups/Orthogroups_UnassignedGenes.tsv" set_minus_orthofinder_result = directory_creater( "../Pan_genome_data/set_minus_orthofinder_result") pav_orthofinder_file_name = "../Pan_genome_data/set_minus_orthofinder_result/pav_orthofinder.xlsx" # R_set_minus_cut_orthofinder(orthofinder_assianed_tsv_file_name,orthofinder_unassianed_tsv_file_name,pan_id_file,str(set_minus_orthofinder_result)+"/") # set_minus(str(set_minus_orthofinder_result)+"/") # drawer_curve(str(set_minus_orthofinder_result)+"/") MGG_70_15 = "../../70-15_refference_genome/magnaporthe_oryzae_70-15_8_genes.fasta" Augustus_70_15 = "../../GFF/70-15_gene.fasta" MGG_Augustus_70_15_dir = directory_creater( "../Pan_genome_data/70-15_MGG_Augustus") MGG_Augustus_70_15_blast_db_dir = directory_creater(MGG_Augustus_70_15_dir / "blast_db") blast_out_asn_file = MGG_Augustus_70_15_dir / "blast_out.asn" blast_out_xml_file = MGG_Augustus_70_15_dir / "blast_out.xml" blast_out_txt_file = MGG_Augustus_70_15_dir / "blast_out.txt" MGG_unpresent_Augustus = MGG_Augustus_70_15_dir / "MGG_unpresent_Augustus_list.txt"
def extract_ortholog_gene(gene_base_name, joined_df_file_name, id_file,
                          ortholog_blast_path_name):
    '''
    Set up the module-level state used by the ortholog helpers, then walk the
    joined table row by row.

    input 1: gene_base_name, fasta indexed into the global gene_base
    input 2: joined_df_file_name, tab-separated table with a header row
    input 3: id_file, strain id file read through extract_strain_id
    output 1: ortholog_blast_path_name, blast output root (must already
              exist); sequence output goes to <root>/../sequence_out

    Rows whose head cell holds exactly one id go to one2one and get a
    <id>.gff written from slop_list2gff(); all other rows go to two_head.
    Besides the named globals, "<stem>_json" and "<strain>_db" entries are
    injected into the module namespace.
    '''
    global gene_base
    global blast_general, blast_per_MGG_gene, blast_body_sequence_path
    global blast_head_sequence_path, blast_db_path, blast_out_path
    global blast_out_xml_path, blast_out_txt_path, blast_out_asn_path
    global blast_sequence, fst_sequence, fst_sequence_two_head, blast_result
    global blast_exception_path_name, gffutils_db_dir_path_name
    global json_dir_path_name, contig_dir_path_name, gff_path_name
    global fst_sequence_slop_1K_dir, protein_id_vs_chrom_dir
    global MGG_db, strain_protein_id_pattern
    global blast_general_same_strain_list_fl, blast_general_value_list_fl

    gene_base = SeqIO.index(gene_base_name, "fasta", key_function=get_id_gene)
    blast_root = Path(ortholog_blast_path_name)
    sequence_out_dir = directory_creater(blast_root.parent / "sequence_out")

    blast_general = blast_root / "blast_general"
    blast_per_MGG_gene = blast_root / "blast_per_MGG_gene"
    blast_body_sequence_path = blast_root / "blast_body_sequence"
    blast_head_sequence_path = blast_root / "blast_head_sequence"
    blast_db_path = blast_root / "blast_db"
    blast_out_path = blast_root / "blast_out"
    blast_out_xml_path = blast_out_path / "blast_out_xml"
    blast_out_asn_path = blast_out_path / "blast_out_asn"
    blast_out_txt_path = blast_out_path / "blast_out_txt"
    blast_sequence = blast_root / "blast_sequence"
    fst_sequence = sequence_out_dir / "fst_sequence"
    fst_sequence_two_head = sequence_out_dir / "fst_sequence_two_head"
    blast_result = blast_root / "blast_result"
    # Created in this order so that blast_out exists before its children.
    for plain_dir in (blast_general, blast_per_MGG_gene,
                      blast_body_sequence_path, blast_head_sequence_path,
                      blast_db_path, blast_out_path, blast_out_xml_path,
                      blast_out_asn_path, blast_out_txt_path, blast_sequence,
                      fst_sequence, fst_sequence_two_head, blast_result):
        if not plain_dir.exists():
            plain_dir.mkdir()

    blast_exception_path_name = directory_creater(blast_root / "blast_exception")
    gffutils_db_dir_path_name = directory_creater(blast_root.parent / "gffutils_db")
    json_dir_path_name = Path("../Pan_genome_data/contig_length_json/")
    contig_dir_path_name = Path("../../contig/")
    gff_path_name = Path("../../GFF/")
    slop_gff_dir = directory_creater(sequence_out_dir / "gene_slop_1K_gff")
    fst_sequence_slop_1K_dir = directory_creater(sequence_out_dir /
                                                 "fst_sequence_slop_1K")
    protein_id_vs_chrom_dir = directory_creater(sequence_out_dir /
                                                "protein_id_vs_chrom")

    strain_ids = extract_strain_id(id_file)
    strain_ids.append("ina168")
    strain_ids.remove("magnaporthe_oryzae_70-15_8_proteins_T0")

    module_namespace = globals()
    # One "<stem>_json" global per json file.
    for json_path in json_dir_path_name.iterdir():
        with json_path.open() as json_handle:
            module_namespace[json_path.stem + "_json"] = json.load(json_handle)

    # One "<strain>_db" global per strain that has a gff file; the gffutils
    # database is built on first use only.
    for strain_id in strain_ids:
        for gff_path in gff_path_name.glob(strain_id + ".gff"):
            strain_db_path = gffutils_db_dir_path_name / (strain_id + ".db")
            if not strain_db_path.is_file():
                gffutils.create_db(str(gff_path),
                                   str(strain_db_path),
                                   force=True,
                                   id_spec=None)
            module_namespace[strain_id + "_db"] = gffutils.FeatureDB(strain_db_path)

    mgg_db_path = gffutils_db_dir_path_name / "MGGdb.db"
    if not mgg_db_path.is_file():
        gffutils.create_db(
            "../../70-15_refference_genome/70-15_Gff/magnaporthe_oryzae_70-15_8_genome_summary_per_gene_amend.txt",
            str(mgg_db_path),
            id_spec=':source:')
    MGG_db = gffutils.FeatureDB(mgg_db_path)
    strain_protein_id_pattern = re.compile("(.+)_protein_(.+)_")

    base = importr("base")
    utils = importr("utils")
    joined_df = utils.read_table(
        joined_df_file_name,
        sep="\t",
        header=True,
        **{'stringsAsFactors': False, 'check.names': False})
    same_strain_list_path = blast_general / "same_strain_list.txt"
    value_list_path = blast_general / "value_list.txt"
    # Same table without its first column.
    joined_df_sub = joined_df.rx(True, -1)
    row_total = int(base.nrow(joined_df)[0])

    # Both handles are module globals, bound for the duration of the loop.
    with same_strain_list_path.open('w+') as blast_general_same_strain_list_fl, \
            value_list_path.open('w+') as blast_general_value_list_fl:
        # R rows are 1-based.
        for row_number in range(1, row_total + 1):
            cells = iter(joined_df_sub.rx(row_number, True))
            head_ids = next(cells)[0].split()
            if len(head_ids) != 1:
                two_head(head_ids, cells)
                continue
            one2one(head_ids[0], cells)
            pybedtools.BedTool(slop_list2gff()).saveas(
                slop_gff_dir / (head_ids[0] + ".gff"))
# blast_identity_value_dir/(head_id+".tsv"), # utils.read_table, # utils.write_table, # blast_db_path, # blast_out_xml_path, # blast_out_asn_path, # blast_out_txt_path, # base # ) # R_blast_vlaue_list.rx2[i]=robjects.DataFrame(R_blast_vlaue_df) # i=i+1 # #检查结果不是all true?multi_copy_gene_Vector中是否有异常值,blast结果中检查只用一个hsp行不行 # # best_blast_value_list=list(R_parse_blast_result.parse_blast_result( # # single_copy_gene_list, # # multi_copy_gene_list, # # R_blast_vlaue_df, # # str(blast_general/(head_id+"_single_err.txt")) # # )) # # extract_gene(best_blast_value_list,all_row_gene_fasta_file,blast_general/(head_id+"_single_copy.fasta")) # R_blast_vlaue_df=base.do_call("rbind",R_blast_vlaue_list) # utils.write_table(R_blast_vlaue_df,**{'file': str(general_out_path/"all.txt")},**{'append': False},**{'quote': False},**{'sep': "\t"},**{'row.names': False},**{'col.names': True}) # extract_ortholog_gene(directory_creater("/gpfshome/home/Baojd/wangzhe/ortho_blast")) extract_ortholog_gene( "strain_db_dir_path", "contig_path", "orthogroup_sequence_path", "pav_file_path", 150, directory_creater("../Pan_genome_data_2/ortholog_blast_2/") )
def extract_ortholog_gene(strain_db_dir_path,contig_path,joined_df_file_name,ortholog_blast_path_name):
    '''
    Build one lazy ortholog_blast task per file in <out>/all_row_gene/fasta,
    hand the task list to merge_df, and execute the whole graph with dask.

    input 1: strain_db_dir_path   (not referenced by this version)
    input 2: contig_path          (not referenced by this version)
    input 3: joined_df_file_name  (not referenced by this version)
    output 1: ortholog_blast_path_name -- root of the output tree
    '''
    out_root = Path(ortholog_blast_path_name)
    blast_root = out_root / "blast_general"
    if not blast_root.exists():
        blast_root.mkdir()

    # directory_creater is called for its side effect even where the returned
    # path is not needed afterwards.
    gene_root = directory_creater(out_root / "all_row_gene")
    fasta_dir = directory_creater(gene_root / "fasta")
    directory_creater(gene_root / "list")
    directory_creater(gene_root / "all_list")

    db_dir = blast_root / "blast_db"
    result_dir = blast_root / "blast_out"
    xml_dir = result_dir / "blast_out_xml"
    asn_dir = result_dir / "blast_out_asn"
    txt_dir = result_dir / "blast_out_txt"
    identity_dir = blast_root / "blast_identity_value"
    # Parents come before children so each plain mkdir() finds its parent.
    for needed_dir in (db_dir, result_dir, xml_dir, asn_dir, txt_dir, identity_dir):
        if not needed_dir.exists():
            needed_dir.mkdir()

    base = importr("base")
    utils = importr("utils")
    r_read_table = utils.read_table
    r_write_table = utils.write_table
    r_list = base.list
    r_length = base.length
    r_do_call = base.do_call

    # The same three list objects are passed to every task.
    only_itself_list = []
    length_0_list = []
    no_mgg_list = []

    blast_tasks = [
        dask.delayed(ortholog_blast)(
            fasta_file,
            identity_dir,
            only_itself_list,
            length_0_list,
            no_mgg_list,
            r_read_table,
            r_write_table,
            db_dir,
            xml_dir,
            asn_dir,
            txt_dir,
            r_list,
            r_length,
            r_do_call,
        )
        for fasta_file in fasta_dir.iterdir()
    ]

    merged = dask.delayed(merge_df)(
        blast_tasks, base.list, utils.write_table, base.do_call, out_root)
    merged.compute()
from set_minus import drawer_curve, set_minus
from set_minus_orthofinder import R_set_minus_cut_orthofinder
from copy_contig import Copy_contig
from Directory_creater import directory_creater
# annotation_secreted_proteins_2 is the variant that has no
# mRNA_protein_mapping_table_file_name.
from annotation_secreted_proteins_2 import drawer_secreted_protein, secreted_protein
from fgenesh_result_to_fasta_pro import fgenesh_result_to_fasta
from CD_HIT import cd_hit
from fgenesh_GFF import writen_name
from extract_sequence_from_Orthogroups import extract_sequence_from_Orthogroups

# Section marker (a bare string statement, evaluated and discarded).
''' copy '''
# Hard-coded, relative input locations; they resolve against the current
# working directory at run time.
contig_path = Path("../../contig/")
MGG_70_15_contig = "../../70-15_refference_genome/70-15_supercontigs.fasta"
ina168_contig = "../../contig/ina168.fasta"
# Output root and two sub-directories, created at import time through
# directory_creater (its return value is used with "/", so presumably a Path).
general_out = directory_creater("../Pan_genome_data_2/")
copy_std_out_err = directory_creater(general_out / "copy_out_err")
contig_156_path = directory_creater(general_out / "156_contig")
# The copy step itself is currently disabled:
# Copy(contig_path,MGG_70_15_contig,ina168_contig,contig_156_path,copy_std_out_err)

# English rendering of the note kept in the string below:
#   phase 2: analyse the predicted-gene results.
#   - 70-15 must be skipped: delete it from the molquest results (already done).
#   - ina168 needs separate handling (already done).
#   - Manually put 70-15 into protein_base and mRNA_base.
#   - FR13 (gene, protein): replace everything by hand, changing "genome" to
#     FR13; the file names are changed to FR13 as well.
#   The only three strains that need special attention: 70-15, ina168, FR13.
''' phase2:分析预测基因结果 70-15需要跳过,从molquest结果里面删掉,已删掉 ina168需要单独处理,已处理 手动把70-15放入protein_base和mRNA_base FR13,gene,protein, 手动全部替换,把genome换成FR13,文件名也换成FR13 唯三需要注意的: 70-15 ina168 FR13 '''
def prepare_for_ParaAT(joined_df_file_name, id_file, out_dir):
    '''
    Write the per-gene nucleotide, protein and homolog-list files under out_dir.

    input 1: joined_df_file_name -- tab-separated table with a header row,
             read through R's utils.read_table; str or pathlib.Path
    input 2: id_file -- forwarded unchanged to merge_to_one()
    output 1: out_dir -- root directory (must support the "/" operator); the
              sub-directories "nucleotide", "aminoacid", "gene_protein_base"
              and "homolog" are created beneath it via directory_creater()

    Side effects: rebinds the module globals coding_gene_base, protein_base,
    gene_fasta_fl and protein_fasta_fl.
    NOTE(review): extract_gene()/one_head()/two_head() receive no file
    handles or indexes as arguments, so they presumably read these globals --
    confirm before renaming any of them.
    '''
    global coding_gene_base, protein_base, gene_fasta_fl, protein_fasta_fl

    gene_dir_path = directory_creater(out_dir / "nucleotide")
    protein_dir_path = directory_creater(out_dir / "aminoacid")
    fasta_base_dir_path = directory_creater(out_dir / "gene_protein_base")
    homolog_dir_path = directory_creater(out_dir / "homolog")
    protein_base_file_path = fasta_base_dir_path / "protein_base.fasta"
    coding_gene_base_file_path = fasta_base_dir_path / "coding_gene_base.fasta"

    # Both merged FASTA bases are built only when the coding-gene base is
    # missing; the protein base is not checked separately.
    if not coding_gene_base_file_path.is_file():
        coding_gene_base_list = [
            'cat',
            '/mnt/d/zhes_learning_space/the_assignment/pan_genome/Mag_genomes/70-15_refference_genome/magnaporthe_oryzae_70-15_8_transcripts.fasta',
            "/mnt/d/zhes_learning_space/the_assignment/pan_genome/Mag_genomes/wangzhe2/New_add_ina168/ina168_CDS.fasta"
        ]
        protein_base_list = [
            'cat',
            '/mnt/d/zhes_learning_space/the_assignment/pan_genome/Mag_genomes/70-15_refference_genome/magnaporthe_oryzae_70-15_8_proteins_T0.fasta',
            "/mnt/d/zhes_learning_space/the_assignment/pan_genome/Mag_genomes/wangzhe2/New_add_ina168/ina168_protein.fasta"
        ]
        merge_to_one(coding_gene_base_list, id_file, "_CDS.fasta",
                     fasta_base_dir_path / "coding_gene_list.txt",
                     coding_gene_base_file_path,
                     fasta_base_dir_path / "coding_gene_cat_err.txt")
        merge_to_one(protein_base_list, id_file, "_protein.fasta",
                     fasta_base_dir_path / "protein_list.txt",
                     protein_base_file_path,
                     fasta_base_dir_path / "protein_cat_err.txt")

    coding_gene_base = SeqIO.index(str(coding_gene_base_file_path), "fasta",
                                   key_function=get_id_protein)
    protein_base = SeqIO.index(str(protein_base_file_path), "fasta",
                               key_function=get_id_protein)

    base = importr("base")
    utils = importr("utils")
    # str() lets a pathlib.Path be passed as well as a plain string, matching
    # the extract_ortholog_gene variant in this file that already converts.
    # NOTE(review): rpy2 is believed to have no default Path conversion.
    ortholog_joined_df = utils.read_table(
        str(joined_df_file_name),
        sep="\t",
        header=True,
        **{'stringsAsFactors': False, 'check.names': False})
    # R-style negative index: every column except the first.
    ortholog_joined_df_sub = ortholog_joined_df.rx(True, -1)

    # R rows are 1-based.
    for row_number in range(1, int(base.nrow(ortholog_joined_df)[0]) + 1):
        df_row = ortholog_joined_df_sub.rx(row_number, True)
        df_row_iter = iter(df_row)
        # The first cell holds one or more whitespace-separated head ids.
        head_list = next(df_row_iter)[0].split()
        if len(head_list) != 1:
            two_head(head_list, df_row_iter)
            continue

        head_id = head_list[0]
        gene_fasta = gene_dir_path / (head_id + ".fasta")
        protein_fasta = protein_dir_path / (head_id + ".fasta")
        homolog_file_path = homolog_dir_path / (head_id + ".txt")
        # An existing nucleotide FASTA marks the gene as already processed.
        if gene_fasta.is_file():
            continue

        with gene_fasta.open('w') as gene_fasta_fl, \
                protein_fasta.open('w') as protein_fasta_fl, \
                homolog_file_path.open('w') as homolog_fl:
            extract_gene(head_id)
            # One line per gene: the head id, then every id yielded by
            # one_head(), each followed by a tab.
            homolog_fl.write(head_id + "\t")
            for homolog_id in one_head(df_row_iter):
                homolog_fl.write(homolog_id + "\t")
            homolog_fl.write("\n")
def run_paraAT(in_dir):
    '''
    Run Popen_paraAT over every parameter set from format_paraAT_parameter,
    using a 12-job Parallel pool.

    input 1: in_dir -- directory (must support "/"); "ParaAT_stdout" and
             "ParaAT_stderr" are created beneath it via directory_creater
             and passed to every job
    '''
    stdout_dir = directory_creater(in_dir / "ParaAT_stdout")
    stderr_dir = directory_creater(in_dir / "ParaAT_stderr")

    pool = Parallel(n_jobs=12)
    # Kept as a generator so the pool pulls parameter sets itself.
    pending_jobs = (
        delayed(Popen_paraAT)(parameter_set, stdout_dir, stderr_dir)
        for parameter_set in format_paraAT_parameter(in_dir)
    )
    pool(pending_jobs)
def call_snp_mummer(id_file, contig_dir_name, out_dir):
    '''
    Run nucmer -> call_filter -> snp -> To_vcf for every strain, then merge
    all per-strain VCFs with "bcftools merge" into <out_dir>/merge.vcf.

    input 1: id_file -- passed to extract_strain_id() to obtain the strain
             list; "ina168" is appended and the 70-15 entry
             "magnaporthe_oryzae_70-15_8_proteins_T0" is removed
             (list.remove raises ValueError when that entry is absent)
    input 2: contig_dir_name -- directory with one "<strain>.fasta" per strain
    output 1: out_dir -- output root (must support the "/" operator)

    Each step is skipped when its output file already exists, so an
    interrupted run can be resumed.  The bcftools exit status is not checked.

    Side effects: rebinds the module global merge_list to the full bcftools
    command line.
    '''
    global merge_list

    strain_95_list = extract_strain_id(id_file)
    strain_95_list.append("ina168")
    strain_95_list.remove("magnaporthe_oryzae_70-15_8_proteins_T0")
    contig_dir_path = Path(contig_dir_name)

    intermediate_dir_path = directory_creater(out_dir / "intermediate_files")
    nucmer_stdout_dir_path = directory_creater(intermediate_dir_path / "nucmer_std_out")
    nucmer_stderr_dir_path = directory_creater(intermediate_dir_path / "nucmer_std_err")
    delta_dir_path = directory_creater(intermediate_dir_path / "delta")
    delta_filter_err_dir_path = directory_creater(intermediate_dir_path / "filter_err")
    delta_filter_file_dir_path = directory_creater(intermediate_dir_path / "filter")
    delta_snp_dir_path = directory_creater(intermediate_dir_path / "snp")
    delta_snp_err_path = directory_creater(intermediate_dir_path / "snp_err")
    delta_vcf_file_path = directory_creater(intermediate_dir_path / "vcf")
    delta_vcf_err_path = directory_creater(intermediate_dir_path / "To_vcf_err")
    merge_std_out_err_dir_path = directory_creater(intermediate_dir_path / "merge_std_out_err")
    bgzip_out_err_dir_path = directory_creater(intermediate_dir_path / "bgzip")
    index_out_err_dir_path = directory_creater(intermediate_dir_path / "index")

    merge_list = ["bcftools", "merge"]
    for strain_95_id in strain_95_list:
        delta_file_path = delta_dir_path / (strain_95_id + ".delta")
        if not delta_file_path.is_file():
            nucmer(str(contig_dir_path / (strain_95_id + ".fasta")),
                   str(delta_file_path),
                   nucmer_stdout_dir_path / (strain_95_id + '_stdout.txt'),
                   nucmer_stderr_dir_path / (strain_95_id + '_stderr.txt'))

        delta_filter_file_path = delta_filter_file_dir_path / (strain_95_id + "_filter.txt")
        if not delta_filter_file_path.is_file():
            call_filter(delta_file_path,
                        delta_filter_file_path,
                        delta_filter_err_dir_path / (strain_95_id + "_filter_err.txt"))

        delta_snp_file_path = delta_snp_dir_path / (strain_95_id + "_snp.txt")
        if not delta_snp_file_path.is_file():
            snp(delta_filter_file_path,
                delta_snp_file_path,
                delta_snp_err_path / (strain_95_id + "_snp_err.txt"))

        delta_snp2vcf_path = delta_vcf_file_path / (strain_95_id + ".vcf")
        delta_snp2vcf_gz_path = delta_vcf_file_path / (strain_95_id + ".vcf.gz")
        # The existence check is on the ".vcf.gz" file, not the plain ".vcf".
        if not delta_snp2vcf_gz_path.is_file():
            To_vcf(delta_snp_file_path,
                   delta_snp2vcf_path,
                   delta_vcf_err_path / (strain_95_id + "_to_vcf_err.txt"),
                   bgzip_out_err_dir_path / (strain_95_id + "_bgzip_out_err.txt"),
                   index_out_err_dir_path / (strain_95_id + "_index_out_err.txt"))

        # Every strain contributes its compressed VCF to the merge command,
        # whether it was produced now or on an earlier run.
        merge_list.append(str(delta_snp2vcf_path) + ".gz")

    merge_list.extend(["-O", "v", "-o", str(out_dir / "merge.vcf")])

    # Open the log in a context manager so the handle is closed once bcftools
    # exits; stderr is folded into the same file.
    with (merge_std_out_err_dir_path / "merge_out.txt").open('w') as merge_log_fl:
        subprocess.run(merge_list, stdout=merge_log_fl, stderr=subprocess.STDOUT)
def extract_ortholog_gene(strain_db_dir_path, contig_path, orthogroup_sequence_path, ortholog_blast_path_name):
    '''
    Run ortholog_blast sequentially on every entry of the blast_identity_value
    directory, row-bind the usable results in R and write them to
    <out>/all.txt (tab-separated, with a header row, no row names, unquoted).

    input 1: strain_db_dir_path        (not referenced by this version)
    input 2: contig_path               (not referenced by this version)
    input 3: orthogroup_sequence_path  (not referenced by this version)
    output 1: ortholog_blast_path_name -- root of the output tree

    The log files no_mgg.txt, only_itself.txt and length_0.txt are truncated
    on every call, handed to ortholog_blast, and closed when the loop ends.
    '''
    general_out_path = Path(ortholog_blast_path_name)
    blast_general = general_out_path / "blast_general"
    if not blast_general.exists():
        blast_general.mkdir()

    # Created for their side effect; none of them is read below.
    all_row_gene_dir_path = directory_creater(general_out_path / "all_row_gene")
    directory_creater(all_row_gene_dir_path / "fasta")
    directory_creater(all_row_gene_dir_path / "list")
    directory_creater(all_row_gene_dir_path / "all_list")

    blast_db_path = blast_general / "blast_db"
    blast_out_path = blast_general / "blast_out"
    blast_out_xml_path = blast_out_path / "blast_out_xml"
    blast_out_asn_path = blast_out_path / "blast_out_asn"
    blast_out_txt_path = blast_out_path / "blast_out_txt"
    blast_identity_value_dir = blast_general / "blast_identity_value"
    # Parents come before children so each plain mkdir() finds its parent.
    for needed_dir in (blast_db_path, blast_out_path, blast_out_xml_path,
                       blast_out_asn_path, blast_out_txt_path,
                       blast_identity_value_dir):
        if not needed_dir.exists():
            needed_dir.mkdir()

    base = importr("base")
    utils = importr("utils")

    R_blast_vlaue_list = base.list()
    i = 1  # next free 1-based slot in the R list; advanced only for kept results

    with (general_out_path / "no_mgg.txt").open('w') as no_mgg_fl, \
            (general_out_path / "only_itself.txt").open('w') as only_itself_fl, \
            (general_out_path / "length_0.txt").open('w') as length_0_fl:
        # NOTE(review): this iterates blast_identity_value_dir (also passed as
        # the output directory), whereas other versions of this function
        # iterate all_row_gene/fasta -- confirm the intended input directory.
        for all_row_gene_fasta_file in blast_identity_value_dir.iterdir():
            R_blast_vlaue_df = ortholog_blast(
                all_row_gene_fasta_file,
                blast_identity_value_dir,
                only_itself_fl,
                length_0_fl,
                no_mgg_fl,
                utils.read_table,
                utils.write_table,
                blast_db_path,
                blast_out_xml_path,
                blast_out_asn_path,
                blast_out_txt_path,
                base)
            # "NA" -- as the whole result or as its V1 column -- marks a
            # result that must not be merged.
            if R_blast_vlaue_df == "NA" or R_blast_vlaue_df.rx2("V1") == "NA":
                continue
            R_blast_vlaue_list.rx2[i] = robjects.DataFrame(R_blast_vlaue_df)
            i = i + 1

    R_blast_vlaue_df = base.do_call("rbind", R_blast_vlaue_list)
    utils.write_table(
        R_blast_vlaue_df,
        **{
            'file': str(general_out_path / "all.txt"),
            'append': False,
            'quote': False,
            'sep': "\t",
            'row.names': False,
            'col.names': True,
        })
# i=i+1 # #检查结果不是all true?multi_copy_gene_Vector中是否有异常值,blast结果中检查只用一个hsp行不行 # # best_blast_value_list=list(R_parse_blast_result.parse_blast_result( # # single_copy_gene_list, # # multi_copy_gene_list, # # R_blast_vlaue_df, # # str(blast_general/(head_id+"_single_err.txt")) # # )) # # extract_gene(best_blast_value_list,all_row_gene_fasta_file,blast_general/(head_id+"_single_copy.fasta")) # R_blast_vlaue_df=base.do_call("rbind",R_blast_vlaue_list) # utils.write_table(R_blast_vlaue_df,**{'file': str(general_out_path/"all.txt")},**{'append': False},**{'quote': False},**{'sep': "\t"},**{'row.names': False},**{'col.names': True}) if __name__ == '__main__': # client = Client(n_workers=12, threads_per_worker=1) cluster = PBSCluster( job_extra=["-l nodes=1:ppn=24","-l mem=5000MB"], header_skip=["select"], processes=24, walltime='25:00:00' ) cluster.scale(jobs=10) from dask.distributed import Client client = Client(cluster) print(cluster.job_script()) extract_ortholog_gene( Path("../Pan_genome_data_2/contig_gff3_gffutils_db"), Path("../Pan_genome_data_2/156_contig"), Path("../Pan_genome_data_2/ortho/joined_df.tsv"), directory_creater("/gpfshome/home/Baojd/wangzhe/ortholog_blast_mgg_key"), )
def extract_ortholog_gene(strain_db_dir_path, contig_path, joined_df_file_name, ortholog_blast_path_name):
    '''
    Create the BLAST / sequence output tree, publish its paths through module
    globals, open the gffutils database for 70-15, then call one2one() on
    every row of the joined ortholog table.

    input 1: strain_db_dir_path   (not referenced by this version)
    input 2: contig_path          (not referenced by this version)
    input 3: joined_df_file_name -- tab-separated table with a header row
    output 1: ortholog_blast_path_name -- output root; str or pathlib.Path

    Side effects: rebinds many module globals (see the global statements
    below), including the two open file handles
    blast_general_same_strain_list_fl and blast_general_value_list_fl, which
    are only valid while the row loop is running.
    NOTE(review): one2one() receives only the row iterator, so it presumably
    reads these globals -- confirm before renaming any of them.
    '''
    global fst_sequence_slop_1K_dir, protein_id_vs_chrom_dir
    global blast_general, blast_per_MGG_gene, blast_body_sequence_path, blast_head_sequence_path, blast_db_path, blast_out_path, blast_out_xml_path, blast_out_txt_path, blast_sequence, fst_sequence, blast_result, blast_out_asn_path
    global fst_sequence_two_head
    global blast_exception_path_name
    global gffutils_db_dir_path_name
    global contig_dir_path_name
    global gff_path_name
    global MGG_db
    global blast_general_same_strain_list_fl, blast_general_value_list_fl

    def _mkdir_if_missing(path):
        # Create a single directory level unless the path already exists.
        if not path.exists():
            path.mkdir()
        return path

    # Convert first so a plain string argument works with "/" as well; the
    # previous code applied "/" to the raw argument before converting it.
    blast_path = Path(ortholog_blast_path_name)

    sequence_out_dir = directory_creater(blast_path / "sequence_out")
    directory_creater(sequence_out_dir / "gene_slop_1K_gff")
    fst_sequence_slop_1K_dir = directory_creater(sequence_out_dir / "fst_sequence_slop_1K")
    protein_id_vs_chrom_dir = directory_creater(sequence_out_dir / "protein_id_vs_chrom")

    # NOTE(review): from here on "sequence_out" is the directory NEXT TO
    # blast_path (blast_path.parent), not the one inside it used above, so the
    # fst_sequence* directories land in a different tree -- confirm intent.
    sequence_out_dir = directory_creater(blast_path.parent / "sequence_out")

    blast_general = _mkdir_if_missing(blast_path / "blast_general")
    blast_per_MGG_gene = _mkdir_if_missing(blast_path / "blast_per_MGG_gene")
    blast_body_sequence_path = _mkdir_if_missing(blast_path / "blast_body_sequence")
    blast_head_sequence_path = _mkdir_if_missing(blast_path / "blast_head_sequence")
    blast_db_path = _mkdir_if_missing(blast_path / "blast_db")
    blast_out_path = _mkdir_if_missing(blast_path / "blast_out")
    blast_out_xml_path = _mkdir_if_missing(blast_out_path / "blast_out_xml")
    blast_out_asn_path = _mkdir_if_missing(blast_out_path / "blast_out_asn")
    blast_out_txt_path = _mkdir_if_missing(blast_out_path / "blast_out_txt")
    blast_sequence = _mkdir_if_missing(blast_path / "blast_sequence")
    fst_sequence = _mkdir_if_missing(sequence_out_dir / "fst_sequence")
    fst_sequence_two_head = _mkdir_if_missing(sequence_out_dir / "fst_sequence_two_head")
    blast_result = _mkdir_if_missing(blast_path / "blast_result")

    blast_exception_path_name = directory_creater(blast_path / "blast_exception")
    gffutils_db_dir_path_name = directory_creater(blast_path.parent / "gffutils_db")

    # Hard-coded input locations, resolved against the working directory.
    contig_dir_path_name = Path("../../contig/")
    gff_path_name = Path("../../GFF/")

    # Build the gffutils database once; later runs reuse the file.
    MGG_db_file_path = gffutils_db_dir_path_name / "MGGdb.db"
    if not MGG_db_file_path.is_file():
        gffutils.create_db(
            "../../70-15_refference_genome/70-15_Gff/magnaporthe_oryzae_70-15_8_genome_summary_per_gene_amend.txt",
            str(MGG_db_file_path),
            id_spec=':source:')
    MGG_db = gffutils.FeatureDB(MGG_db_file_path)

    base = importr("base")
    utils = importr("utils")
    ortholog_joined_df = utils.read_table(
        str(joined_df_file_name),
        sep="\t",
        header=True,
        **{'stringsAsFactors': False, 'check.names': False})
    # R-style negative index: every column except the first.
    ortholog_joined_df_sub = ortholog_joined_df.rx(True, -1)

    blast_general_same_strain_list = blast_general / "same_strain_list.txt"
    blast_general_value_list = blast_general / "value_list.txt"
    with blast_general_same_strain_list.open('w+') as blast_general_same_strain_list_fl, \
            blast_general_value_list.open('w+') as blast_general_value_list_fl:
        # R rows are 1-based.
        for row_number in range(1, int(base.nrow(ortholog_joined_df)[0]) + 1):
            df_row = ortholog_joined_df_sub.rx(row_number, True)
            one2one(iter(df_row))
def extract_ortholog_gene(strain_db_dir_path, contig_path, orthogroup_sequence_path, pav_file_path, filter_lower, ortholog_blast_path_name):
    '''
    Run ortholog_blast sequentially for every entry listed in the
    "filter_<filter_lower>.txt" file under the output root.

    input 1: strain_db_dir_path        (not referenced by this version)
    input 2: contig_path               (not referenced by this version)
    input 3: orthogroup_sequence_path  (not referenced by this version)
    input 4: pav_file_path -- passed (as str) to
             R_filter_strain_num.filter_strain_num when the filter list does
             not exist yet
    input 5: filter_lower -- threshold forwarded to filter_strain_num; also
             part of the filter list file name
    output 1: ortholog_blast_path_name -- root of the output tree

    The log files no_mgg.txt, only_itself.txt and length_0.txt are truncated
    on every call, handed to ortholog_blast, and closed on return.
    '''
    general_out_path = Path(ortholog_blast_path_name)
    blast_general = general_out_path / "blast_general"
    if not blast_general.exists():
        blast_general.mkdir()

    all_row_gene_dir_path = directory_creater(general_out_path / "all_row_gene")
    all_row_gene_fasta_dir = directory_creater(all_row_gene_dir_path / "fasta")
    # Created for their side effect; not read below.
    directory_creater(all_row_gene_dir_path / "list")
    directory_creater(all_row_gene_dir_path / "all_list")

    blast_db_path = blast_general / "blast_db"
    blast_out_path = blast_general / "blast_out"
    blast_out_xml_path = blast_out_path / "blast_out_xml"
    blast_out_asn_path = blast_out_path / "blast_out_asn"
    blast_out_txt_path = blast_out_path / "blast_out_txt"
    blast_identity_value_dir = blast_general / "blast_identity_value"
    # Parents come before children so each plain mkdir() finds its parent.
    for needed_dir in (blast_db_path, blast_out_path, blast_out_xml_path,
                       blast_out_asn_path, blast_out_txt_path,
                       blast_identity_value_dir):
        if not needed_dir.exists():
            needed_dir.mkdir()

    base = importr("base")
    utils = importr("utils")

    with (general_out_path / "no_mgg.txt").open('w') as no_mgg_fl, \
            (general_out_path / "only_itself.txt").open('w') as only_itself_fl, \
            (general_out_path / "length_0.txt").open('w') as length_0_fl:
        # The filter list is computed once and reused on later runs.
        filter_lower_list_file = general_out_path / ("filter_" + str(filter_lower) + ".txt")
        if not filter_lower_list_file.exists():
            R_filter_strain_num.filter_strain_num(str(pav_file_path),
                                                  filter_lower,
                                                  str(filter_lower_list_file))

        with filter_lower_list_file.open() as filter_lower_list_fl:
            # One entry per line; strip() removes the trailing newline.
            for all_row_gene_fasta_file in filter_lower_list_fl:
                ortholog_blast(all_row_gene_fasta_file.strip(),
                               all_row_gene_fasta_dir,
                               blast_identity_value_dir,
                               only_itself_fl,
                               length_0_fl,
                               no_mgg_fl,
                               utils.read_table,
                               utils.write_table,
                               blast_db_path,
                               blast_out_xml_path,
                               blast_out_asn_path,
                               blast_out_txt_path,
                               base.list,
                               base.length,
                               base.do_call)
# # break # extract_gene_gff(all_row_gene_list,strain_db_dir_path,contig_path,all_row_gene_fasta_file) # R_blast_vlaue_df=ortholog_blast( # head_id, # all_row_gene_fasta_file, # blast_identity_value_dir/(head_id+".tsv"), # utils.read_table, # utils.write_table, # blast_db_path, # blast_out_xml_path, # blast_out_asn_path, # blast_out_txt_path, # base # ) # R_blast_vlaue_list.rx2[i]=robjects.DataFrame(R_blast_vlaue_df) # i=i+1 # #检查结果不是all true?multi_copy_gene_Vector中是否有异常值,blast结果中检查只用一个hsp行不行 # # best_blast_value_list=list(R_parse_blast_result.parse_blast_result( # # single_copy_gene_list, # # multi_copy_gene_list, # # R_blast_vlaue_df, # # str(blast_general/(head_id+"_single_err.txt")) # # )) # # extract_gene(best_blast_value_list,all_row_gene_fasta_file,blast_general/(head_id+"_single_copy.fasta")) # R_blast_vlaue_df=base.do_call("rbind",R_blast_vlaue_list) # utils.write_table(R_blast_vlaue_df,**{'file': str(general_out_path/"all.txt")},**{'append': False},**{'quote': False},**{'sep': "\t"},**{'row.names': False},**{'col.names': True}) extract_ortholog_gene( directory_creater("/gpfshome/home/Baojd/wangzhe/ortho_blast"))