-
Notifications
You must be signed in to change notification settings - Fork 0
/
What_in_LCB.py
145 lines (137 loc) · 6.75 KB
/
What_in_LCB.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import argparse
import pandas as pd
import os
import sys
from os.path import split
from pyteomics.fasta import read
# Command-line interface.
#
#   -sbs       directory whose entries are read as tab-separated tables with
#              the columns SB, Strain, Coordinates.
#   -antn      directory of annotation tables; only entries whose name
#              contains '.txt' are used, read as tab-separated tables with
#              the columns Strain, Contig, Coordinates, Protein, Part.
#   -orgs      directory holding one '<organism>/All/' folder of FASTA files
#              per organism.
#   -save_way  output prefix; '<name>.txt' is appended to it verbatim.
parser = argparse.ArgumentParser(description='What in SBs')
parser.add_argument('-sbs', type=str, required=True, help='SB')
parser.add_argument('-antn', type=str, required=True, help='Annotation')
# -orgs used to be optional with a default of None, but the script always
# lists it and uses it as a path prefix.  Leaving it out therefore scanned the
# current directory and then failed on paths such as 'None<org>/All/'.
# Requiring it turns that into an immediate, readable usage error.
parser.add_argument('-orgs',
                    type=str,
                    required=True,
                    help='References of organisms')
parser.add_argument('-save_way',
                    type=str,
                    required=True,
                    help='Save way of annotation')
args = parser.parse_args()
# Entry names of the SB, annotation and reference directories, listed in
# exactly that order.
SBs, coord_prot, orgsnisms = map(os.listdir,
                                 (args.sbs, args.antn, args.orgs))

# Module-level scratch strings; all of them start out empty.
part = condition = coordinate_of_split = ''

# Filled while the reference FASTA files are scanned:
# strain name -> {contig id -> number}.
check_list = dict()
# Build check_list: for every reference FASTA file, map each record's contig
# id (first whitespace-separated token of its description) to a number derived
# from the running sequence length of that file.
for org in orgsnisms:
    if '.DS_Store' in org:
        continue
    # os.path.join works whether or not -orgs was given with a trailing slash.
    strains_dir = os.path.join(args.orgs, org, 'All')
    for strain in os.listdir(strains_dir):
        if '.DS_Store' in strain:
            continue
        # The strain key is the file name minus its last 12 characters
        # (presumably a '_genomic.fna'-style suffix -- TODO confirm).
        contig_table = {}
        check_list[strain[:-12]] = contig_table
        running_length = 0
        # The reader is used as a context manager so the underlying file is
        # closed once the records of this strain have been consumed.
        with read(os.path.join(strains_dir, strain)) as fna:
            for position, record in enumerate(fna):
                running_length += len(record.sequence)
                contig_id = record.description.split()[0]
                # The first record is stored as 0; every later record gets the
                # running total, which already includes that record's own
                # length.
                # NOTE(review): if a start offset was intended, the total
                # *before* adding the current record would be expected here.
                # Values are kept as the original computed them -- confirm.
                contig_table[contig_id] = running_length if position else 0
# Layout of the rows written to every output table.
_HEADER = 'Strain\tSB\tCoord_of_SB\tProtein\tCoord_of_prot\tPart\tCondition\n'
_ROW_TEMPLATE = ('{Strain}\t{SB}\t{Start_SB}:{End_SB}\t{Protein}\t'
                 '{Start_Prot}:{End_Prot}\t{Part}\t{Condition}\n')


def _parse_span(text, separator):
    """Return ``(start, end)`` as ints from ``<c>START<separator>END<c>``.

    The first character of the left field and the last character of the right
    field are dropped before conversion; which characters they are is not
    checked.  Raises ``IndexError`` when *separator* is absent and
    ``ValueError`` when a field is not an integer.
    """
    fields = text.split(separator)
    return int(fields[0][1:]), int(fields[1][:-1])


def _overlap_condition(start_SB, end_SB, start_prot, end_prot):
    """Name how the SB interval overlaps the protein interval.

    Returns 'full protein', 'front end' or 'back end', or ``None`` when none
    of the three cases applies.  All comparisons are strict, so intervals that
    share an endpoint, and an SB lying strictly inside the protein, yield
    ``None``.
    """
    if start_SB < start_prot and end_SB > end_prot:
        return 'full protein'
    if start_prot < start_SB < end_prot < end_SB:
        return 'front end'
    if start_SB < start_prot < end_SB < end_prot:
        return 'back end'
    return None


# For every SB table, find the annotation table of the same organism and write
# one row per (SB, protein) pair that shares strain and contig and overlaps.
for organism in SBs:
    # Fixed: this was spelled '.DS_store', which never matches the macOS
    # '.DS_Store' entry that the other loops skip.
    if '.DS_Store' in organism:
        continue
    for an_org in coord_prot:
        if '.DS_Store' in an_org or '.txt' not in an_org:
            continue
        # The organism name is the annotation file name without its first 11
        # and last 4 characters.
        org_name = an_org[11: -4]
        if org_name not in organism:
            continue
        print('Analyzing of {} ...'.format(org_name))
        # -save_way is used as a plain string prefix, so it is concatenated
        # rather than joined.  The context manager closes the file even when
        # parsing below raises.
        with open(args.save_way + '{}.txt'.format(org_name), 'w') as save_file:
            save_file.write(_HEADER)
            SB_table = pd.read_csv(os.path.join(args.sbs, organism),
                                   header=None,
                                   names=('SB', 'Strain', 'Coordinates'),
                                   sep='\t')
            coord = pd.read_csv(os.path.join(args.antn, an_org),
                                header=None,
                                names=('Strain', 'Contig', 'Coordinates',
                                       'Protein', 'Part'),
                                sep='\t')
            # Materialise the annotation rows once; they are rescanned for
            # every SB row.
            protein_rows = list(coord.itertuples(index=False))
            for sb_row in SB_table.itertuples(index=False):
                # Only strains present in check_list can produce output.
                known_contigs = check_list.get(sb_row.Strain)
                if not known_contigs:
                    continue
                for prot_row in protein_rows:
                    if prot_row.Strain != sb_row.Strain:
                        continue
                    for contig_key in known_contigs:
                        # Substring test: the annotation contig only has to
                        # occur inside the FASTA contig id, so one protein can
                        # match several ids and is then written once per id.
                        if prot_row.Contig not in contig_key:
                            continue
                        # NOTE(review): the number stored for this contig in
                        # check_list was fetched into an unused variable and
                        # never applied to the coordinates -- confirm whether
                        # a shift by that value was intended.
                        start_SB, end_SB = _parse_span(sb_row.Coordinates, ':')
                        start_prot, end_prot = _parse_span(
                            prot_row.Coordinates, ',')
                        if prot_row.Part == 'partI':
                            part = 'partI'
                        elif prot_row.Part == 'partII':
                            part = 'partII'
                        else:
                            part = 'full'
                        condition = _overlap_condition(start_SB, end_SB,
                                                       start_prot, end_prot)
                        if condition is None:
                            continue
                        save_file.write(_ROW_TEMPLATE.format(
                            Strain=sb_row.Strain,
                            SB=sb_row.SB,
                            Start_SB=start_SB,
                            End_SB=end_SB,
                            Protein=prot_row.Protein,
                            Start_Prot=start_prot,
                            End_Prot=end_prot,
                            Part=part,
                            Condition=condition,
                        ))
        print('Analizing of {} was done!'.format(org_name))
print('Done!')