/
blast_70_15_mgg_Augustus.py
110 lines (106 loc) · 5.56 KB
/
blast_70_15_mgg_Augustus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio.Blast.Applications import NcbiblastformatterCommandline
from Bio.Blast.Applications import NcbimakeblastdbCommandline
from Bio import SeqIO
from Bio.Blast import NCBIXML
from rpy2.robjects.packages import SignatureTranslatedAnonymousPackage
def blastdb(in_file, db_file):
    """Build a nucleotide BLAST database by running ``makeblastdb``.

    Args:
        in_file: Path passed to ``makeblastdb`` as its input file.
        db_file: Path passed to ``makeblastdb`` as the database output name.
    """
    makeblastdb_options = {
        'cmd': '/mnt/d/zhes_learning_space/software_in_ubuntu/ncbi-blast-2.10.0+/bin/makeblastdb',
        'dbtype': 'nucl',
        'input_file': in_file,
        'out': db_file,
    }
    # Building the wrapper only assembles the command line; calling it runs it.
    run_makeblastdb = NcbimakeblastdbCommandline(**makeblastdb_options)
    run_makeblastdb()
def blast(db_file, query_file, blast_out_asn_file, blast_out_xml_file, blast_out_txt_file):
    """Run ``blastn`` once and re-format the result as XML and as text.

    The search itself is written with ``outfmt=11`` to ``blast_out_asn_file``;
    ``blast_formatter`` then converts that archive twice, first with
    ``outfmt=5`` and then with ``outfmt=7``, so the search is not repeated.

    Args:
        db_file: BLAST database to search against.
        query_file: File holding the query sequences.
        blast_out_asn_file: Output path of the ``outfmt=11`` search result;
            also the archive read by both formatter runs.
        blast_out_xml_file: Output path of the ``outfmt=5`` conversion.
        blast_out_txt_file: Output path of the ``outfmt=7`` conversion.
    """
    blastn_path = '/mnt/d/zhes_learning_space/software_in_ubuntu/ncbi-blast-2.10.0+/bin/blastn'
    formatter_path = '/mnt/d/zhes_learning_space/software_in_ubuntu/ncbi-blast-2.10.0+/bin/blast_formatter'

    run_search = NcbiblastnCommandline(
        cmd=blastn_path,
        query=query_file,
        db=db_file,
        outfmt=11,
        out=blast_out_asn_file,
    )
    run_search()

    # Same order as before: XML report first, then the text report.
    conversions = (
        (5, blast_out_xml_file),
        (7, blast_out_txt_file),
    )
    for report_format, report_path in conversions:
        run_formatter = NcbiblastformatterCommandline(
            cmd=formatter_path,
            archive=blast_out_asn_file,
            outfmt=report_format,
            out=report_path,
        )
        run_formatter()
def read_blast_result(blast_out_xml_file, query_file, MGG_unpresent_Augustus):
    """Split BLAST queries into hits and misses and report the miss count.

    Every record of the BLAST XML file is inspected. The first
    whitespace-separated token of the record's query is used as its id.
    Ids of records without any alignment are written, one per line, to
    ``MGG_unpresent_Augustus``. Afterwards the ids of ``query_file`` that
    never produced an alignment are counted and that count is printed.

    Args:
        blast_out_xml_file: BLAST result in XML format.
        query_file: FASTA file with the query sequences.
        MGG_unpresent_Augustus: Output path for the ids without alignments
            (created or truncated).

    Returns:
        None. The number of queries without a hit is printed to stdout.
    """
    hits = set()
    # Both files are managed by the ``with`` statement so the XML handle is
    # closed as well (it used to be opened inline and never closed).
    with open(MGG_unpresent_Augustus, 'w+') as out_fl, \
            open(blast_out_xml_file) as xml_handle:
        for record in NCBIXML.parse(xml_handle):
            query_id = record.query.split()[0]
            if record.alignments:
                hits.add(query_id)
            else:
                out_fl.write("{}\n".format(query_id))

    # SeqIO.index keeps the FASTA file open until close() is called.
    query_dict = SeqIO.index(query_file, "fasta")
    try:
        misses = set(query_dict.keys()) - hits
    finally:
        query_dict.close()
    print(len(misses))
def MGG_unpresent_Augustus_locate_in_pav_orthofinder(
        MGG_unpresent_Augustus_list_file_name,
        pav_orthofinder_file_name,
        gene_protein_mapping_table_file_name,
        pav_MGG_unpresent_Augustus_file_name,
        orthofinder_unassianed_tsv_file_name,
        MGG_unpresent_Augustus_unassianed_list_file_name):
    """Look up the listed ids in the PAV/OrthoFinder table via embedded R.

    The R function defined below:
      1. reads the id list and the gene/protein mapping table and merges
         them on their first columns;
      2. reads the xlsx table, drops its first column and keeps the rows
         whose ``protein_id`` is among the merged table's second column;
      3. moves column 158 to the front and writes the result as xlsx;
      4. intersects the kept ``protein_id`` values with the ``70-15_protein``
         column of the tab-separated OrthoFinder file and writes that list.

    Args:
        MGG_unpresent_Augustus_list_file_name: Input id list (read.table).
        pav_orthofinder_file_name: Input xlsx table.
        gene_protein_mapping_table_file_name: Input mapping table.
        pav_MGG_unpresent_Augustus_file_name: Output xlsx path.
        orthofinder_unassianed_tsv_file_name: Input tab-separated table with
            a header row.
        MGG_unpresent_Augustus_unassianed_list_file_name: Output path of the
            intersected id list.
    """
    r_source = '''
MGG_unpresent_Augustus_locate_in_pav_orthofinder=function(MGG_unpresent_Augustus_list_file_name,pav_orthofinder_file_name,gene_protein_mapping_table_file_name,pav_MGG_unpresent_Augustus_file_name,orthofinder_unassianed_tsv_file_name,MGG_unpresent_Augustus_unassianed_list_file_name){
require(readxl)
require(WriteXLS)
require(dplyr)
MGG_unpresent_Augustus_list=read.table(MGG_unpresent_Augustus_list_file_name,stringsAsFactors = F)
pav_orthofinde=read_xlsx(pav_orthofinder_file_name)
pav_orthofinde=pav_orthofinde[,-1]
gene_protein_mapping_table=read.table(gene_protein_mapping_table_file_name)
pav_orthofinde_protein=merge(MGG_unpresent_Augustus_list,gene_protein_mapping_table,by.x = 1,by.y = 1)
pav_MGG_unpresent_Augustus =pav_orthofinde %>%
filter(protein_id %in% pav_orthofinde_protein$V2)
pav_MGG_unpresent_Augustus=pav_MGG_unpresent_Augustus[,c(158,1:157)]
WriteXLS::WriteXLS(pav_MGG_unpresent_Augustus,pav_MGG_unpresent_Augustus_file_name)
orthofinder_unassianed=read.table(orthofinder_unassianed_tsv_file_name,sep = "\t",header = T,check.names = F)
MGG_unpresent_Augustus_unassianed_list=intersect(pav_MGG_unpresent_Augustus$protein_id,orthofinder_unassianed$`70-15_protein`)
write.table(MGG_unpresent_Augustus_unassianed_list,MGG_unpresent_Augustus_unassianed_list_file_name,quote = F,row.names = F,col.names = F)
}
'''
    # Evaluate the R source as an anonymous package, then call its function.
    r_package = SignatureTranslatedAnonymousPackage(
        r_source,
        "R_MGG_unpresent_Augustus_locate_in_pav_orthofinder",
    )
    locate_in_r = r_package.MGG_unpresent_Augustus_locate_in_pav_orthofinder
    locate_in_r(
        MGG_unpresent_Augustus_list_file_name,
        pav_orthofinder_file_name,
        gene_protein_mapping_table_file_name,
        pav_MGG_unpresent_Augustus_file_name,
        orthofinder_unassianed_tsv_file_name,
        MGG_unpresent_Augustus_unassianed_list_file_name,
    )
def remove_MGG_unpresent_Augustus(pav_orthofinder, MGG_unpresent_Augustus_unassianed_list_file_name, pav_orthofinder_1574):
    """Remove from pav_orthofinder the 1574 entries found among the unassigned genes.

    The embedded R function reads the xlsx table, drops every row whose
    ``protein_id`` occurs in the first column of the id list, removes the
    table's first column, reorders the remaining columns as
    ``c(1,158,2:157)`` and writes the result as xlsx.

    Args:
        pav_orthofinder: Input 1, path of the pav_orthofinder xlsx file.
        MGG_unpresent_Augustus_unassianed_list_file_name: Input 2, path of
            the MGG_unpresent_Augustus_unassianed_list file.
        pav_orthofinder_1574: Output 1, path of the filtered xlsx file.
    """
    r_source = '''
R_remove_MGG_unpresent_Augustus=function(
pav_orthofinder_file_name,
MGG_unpresent_Augustus_unassianed_list_file_name,
pav_orthofinder_1574_file_name
){
require(readxl)
require(WriteXLS)
require(dplyr)
MGG_unpresent_Augustus_unassianed_list=read.table(MGG_unpresent_Augustus_unassianed_list_file_name)
pav_orthofinder=read_xlsx(pav_orthofinder_file_name)
pav_orthofinder_1574=pav_orthofinder %>%
filter(!(protein_id %in% MGG_unpresent_Augustus_unassianed_list$V1))
pav_orthofinder_1574=pav_orthofinder_1574[,-1]
pav_orthofinder_1574=pav_orthofinder_1574[,c(1,158,2:157)]
WriteXLS::WriteXLS(pav_orthofinder_1574,pav_orthofinder_1574_file_name)
}
'''
    r_package = SignatureTranslatedAnonymousPackage(
        r_source,
        "R_remove_MGG_unpresent_Augustus",
    )
    # Every argument is converted with str() before it is handed to R.
    input_table_path = str(pav_orthofinder)
    id_list_path = str(MGG_unpresent_Augustus_unassianed_list_file_name)
    output_table_path = str(pav_orthofinder_1574)
    r_package.R_remove_MGG_unpresent_Augustus(
        input_table_path,
        id_list_path,
        output_table_path,
    )