def generate(X, seqType, args):
    '''
    Encode nucleotides with the 4-value "Transition-Transversion" table.

    # Reference: It is a very common feature.

    :param X: sequences to encode; passed unchanged to utils.processMono.
    :param seqType: 'DNA' or 'RNA'. 'PROT' prints an error and is rejected;
                    any other value is rejected silently.
    :param args: run-time options, forwarded to utils.processMono.
    :return: None. The encoded data is handed to save.datasetSave under the
             tag 'tt' with 4 features per position.
    '''
    if seqType == 'PROT':
        print(CRED + 'Error: The \'Transition-Transversion\' feature is NOT applicable for PROT.' + CEND)
        return None

    # DNA and RNA share the same rows; only the fourth base letter differs.
    if seqType == 'DNA':
        fourth_base = 'T'
    elif seqType == 'RNA':
        fourth_base = 'U'
    else:
        # No table exists for an unknown sequence type. Previously this fell
        # through and crashed on an unbound lookup table; bail out instead.
        return None

    d = {
        'A': [0, 5, 5, 1],
        'C': [5, 1, 0, 5],
        'G': [1, 5, 5, 0],
        fourth_base: [5, 0, 1, 5],
        'p': [0, 0, 0, 0],  # padding
    }

    X = utils.processMono(X, d, args)

    # Every row of the table above has 4 values.
    save.datasetSave(X, 4, 'tt')
def generate(X, seqType, args):
    '''
    Count every 1-mer up to kTuple-mer in each sequence ("k-mer composition").

    # Reference-1: (http://rosalind.info/glossary/k-mer-composition/)
    # Reference-2: iRecSpot-EF: https://www.sciencedirect.com/science/article/abs/pii/S0010482518302981

    Occurrences are tallied with a sliding window, so overlapping matches are
    counted ('AAA' contains 'AA' twice). str.count() only reports
    non-overlapping matches and therefore under-counts repeated k-mers.

    :param X: iterable of sequences; each one is cut to args.terminusLength.
              Assumes each sequence is a str (or a sequence of 1-char strs).
    :param seqType: 'DNA', 'RNA' or 'PROT'; forwarded to utils.sequenceElements.
    :param args: options object; kTuple and terminusLength are read here.
    :return: None. The count matrix is handed to save.datasetSave under the
             tag 'fkmer'.
    '''
    elements = utils.sequenceElements(seqType)
    sizes = range(1, args.kTuple + 1)

    # The feature order (all 1-mers, then all 2-mers, ...) does not depend on
    # the sequence, so enumerate it once instead of once per sequence.
    vocabulary = [
        ''.join(letters)
        for size in sizes
        for letters in itertools.product(elements, repeat=size)
    ]

    T = []
    for x in X:
        x = x[:args.terminusLength]

        # One pass per k; windows of different length never share a key.
        tally = {}
        for size in sizes:
            for start in range(len(x) - size + 1):
                window = ''.join(x[start:start + size])
                tally[window] = tally.get(window, 0) + 1

        T.append(np.array([tally.get(word, 0) for word in vocabulary]))

    T = np.array(T)

    totalFeature = 0
    if seqType == 'DNA' or seqType == 'RNA':
        totalFeature = np.sum([4**size for size in sizes])
    elif seqType == 'PROT':
        totalFeature = np.sum([20**size for size in sizes])

    save.datasetSave(T, totalFeature, 'fkmer')
def generate(X, seqType, args):
    '''
    Encode protein residues with their 20-value BLOSUM62 row.

    # Reference-1: http://rosalind.info/glossary/blosum62/
    # Reference-2: https://www.pnas.org/content/89/22/10915/

    :param X: sequences to encode; passed unchanged to utils.processMono.
    :param seqType: only 'PROT' is encoded. 'DNA'/'RNA' print an error;
                    every non-'PROT' value makes the function return early.
    :param args: run-time options, forwarded to utils.processMono.
    :return: None. The encoded data is handed to save.datasetSave under the
             tag 'blosum62' with 20 features per position.
    '''
    if seqType != 'PROT':
        if seqType in ('DNA', 'RNA'):
            print(CRED + 'Error: The \'BLOSUM62\' feature is NOT applicable for DNA/RNA.' + CEND)
        return None

    # Rows and columns share this residue order.
    residues = 'ACDEFGHIKLMNPQRSTVWY'
    matrix = [
        [4, 0, -2, -1, -2, 0, -2, -1, -1, -1, -1, -2, -1, -1, -1, 1, 0, 0, -3, -2],        # A
        [0, 9, -3, -4, -2, -3, -3, -1, -3, -1, -1, -3, -3, -3, -3, -1, -1, -1, -2, -2],    # C
        [-2, -3, 6, 2, -3, -1, -1, -3, -1, -4, -3, 1, -1, 0, -2, 0, -1, -3, -4, -3],       # D
        [-1, -4, 2, 5, -3, -2, 0, -3, 1, -3, -2, 0, -1, 2, 0, 0, -1, -2, -3, -2],          # E
        [-2, -2, -3, -3, 6, -3, -1, 0, -3, 0, 0, -3, -4, -3, -3, -2, -2, -1, 1, 3],        # F
        [0, -3, -1, -2, -3, 6, -2, -4, -2, -4, -3, 0, -2, -2, -2, 0, -2, -3, -2, -3],      # G
        [-2, -3, -1, 0, -1, -2, 8, -3, -1, -3, -2, 1, -2, 0, 0, -1, -2, -3, -2, 2],        # H
        [-1, -1, -3, -3, 0, -4, -3, 4, -3, 2, 1, -3, -3, -3, -3, -2, -1, 3, -3, -1],       # I
        [-1, -3, -1, 1, -3, -2, -1, -3, 5, -2, -1, 0, -1, 1, 2, 0, -1, -2, -3, -2],        # K
        [-1, -1, -4, -3, 0, -4, -3, 2, -2, 4, 2, -3, -3, -2, -2, -2, -1, 1, -2, -1],       # L
        [-1, -1, -3, -2, 0, -3, -2, 1, -1, 2, 5, -2, -2, 0, -1, -1, -1, 1, -1, -1],        # M
        [-2, -3, 1, 0, -3, 0, 1, -3, 0, -3, -2, 6, -2, 0, 0, 1, 0, -3, -4, -2],            # N
        [-1, -3, -1, -1, -4, -2, -2, -3, -1, -3, -2, -2, 7, -1, -2, -1, -1, -2, -4, -3],   # P
        [-1, -3, 0, 2, -3, -2, 0, -3, 1, -2, 0, 0, -1, 5, 1, 0, -1, -2, -2, -1],           # Q
        [-1, -3, -2, 0, -3, -2, 0, -3, 2, -2, -1, 0, -2, 1, 5, -1, -1, -3, -3, -2],        # R
        [1, -1, 0, 0, -2, 0, -1, -2, 0, -2, -1, 1, -1, 0, -1, 4, 1, -2, -3, -2],           # S
        [0, -1, -1, -1, -2, -2, -2, -1, -1, -1, -1, 0, -1, -1, -1, 1, 5, 0, -2, -2],       # T
        [0, -1, -3, -2, -1, -3, -3, 3, -2, 1, 1, -3, -2, -2, -3, -2, 0, 4, -3, -1],        # V
        [-3, -2, -4, -3, 1, -2, -2, -3, -3, -2, -1, -4, -4, -2, -3, -3, -2, -3, 11, 2],    # W
        [-2, -2, -3, -2, 3, -3, 2, -1, -2, -1, -1, -2, -3, -1, -2, -2, -2, -1, 2, 7],      # Y
    ]
    table = dict(zip(residues, matrix))
    table['p'] = [0] * 20  # padding

    X = utils.processMono(X, table, args)

    # Only 'PROT' reaches this point, and each row holds 20 scores.
    save.datasetSave(X, 20, 'blosum62')
def generate(X, seqType, args):
    '''
    Position-wise one-hot encoding of gapped pairs ("pg11").

    For every gap g in 1..args.gGap, each window returned by
    utils.kmers(x, 2 + g) is reduced to its first and last character and
    one-hot encoded over all ordered pairs of sequence elements. The blocks
    for the individual gaps are joined column-wise.

    :param X: iterable of sequences; each one is cut to args.terminusLength.
    :param seqType: 'DNA', 'RNA' or 'PROT'; forwarded to utils.sequenceElements.
    :param args: options object; gGap and terminusLength are read here.
    :return: None. The encoded data is handed to save.datasetSave under the
             tag 'pg11'.
    '''
    # All-zero row used to pad short sequences (one slot per ordered pair).
    if seqType == 'DNA' or seqType == 'RNA':
        p = [0] * (4 * 4)
    elif seqType == 'PROT':
        p = [0] * (20 * 20)
    # Any other seqType leaves the padding row undefined.

    elements = utils.sequenceElements(seqType)
    pair_names = [''.join(duo) for duo in itertools.product(elements, repeat=2)]

    T = []
    for x in X:
        x = x[:args.terminusLength]
        blocks = []
        for gap in range(1, args.gGap + 1):
            span = 2 + gap
            rows = []
            for kmer in utils.kmers(x, span):
                flags = dict.fromkeys(pair_names, 0)
                flags[kmer[0] + kmer[-1]] = 1
                rows.append(list(flags.values()))

            # Rows still missing to reach (terminusLength - 2) rows per block.
            require = (args.terminusLength - 2) - (len(x) - span + 1)
            if require > 0:
                rows.extend([p] * require)

            blocks.append(np.array(rows))
        T.append(np.concatenate(blocks, axis=1))

    T = np.array(T)

    if seqType == 'DNA' or seqType == 'RNA':
        totalFeature = (4 * args.gGap * 4)
    elif seqType == 'PROT':
        totalFeature = (20 * args.gGap * 20)
    else:
        totalFeature = 0

    save.datasetSave(T, totalFeature, 'pg11')
def generate(X, seqType, args):
    '''
    Encode protein residues by side-chain class (8-value one-hot, "pcpP3").

    # Reference: https://www.biorxiv.org/content/10.1101/332171v2.full.pdf (Supp: Table-5)

    :param X: sequences to encode; passed unchanged to utils.processMono.
    :param seqType: only 'PROT' is encoded. 'DNA'/'RNA' print an error;
                    every non-'PROT' value makes the function return early.
    :param args: run-time options, forwarded to utils.processMono.
    :return: None. The encoded data is handed to save.datasetSave under the
             tag 'pcpP3' with 8 features per position.
    '''
    if seqType != 'PROT':
        if seqType in ('DNA', 'RNA'):
            print(CRED + 'Error: The \'Physicochemical Properties-P3\' feature is NOT applicable for DNA/RNA.' + CEND)
        return None

    # One column per class, in this order; each residue is in exactly one.
    classes = (
        'DE',     # Column1 --> Acidic
        'RHK',    # Column2 --> Basic
        'YFW',    # Column3 --> Aromatic side chain
        'ILVAG',  # Column4 --> Aliphatic side chain
        'P',      # Column5 --> Cyclic
        'MC',     # Column6 --> Sulfur containing
        'ST',     # Column7 --> Hydroxyl containing
        'QN',     # Column8 --> Acidic amide
    )

    table = {}
    for residue in 'ACDEFGHIKLMNPQRSTVWY':
        table[residue] = [1 if residue in members else 0 for members in classes]
    table['p'] = [0] * len(classes)  # padding

    X = utils.processMono(X, table, args)

    # Only 'PROT' reaches this point; there are 8 classes.
    save.datasetSave(X, 8, 'pcpP3')
def generate(X, seqType, args):
    '''
    Binary profile feature: one-hot encode every residue ("bpf").

    # Reference: https://doi.org/10.1016/j.omtn.2019.04.025 (It is also called "identity matrix".)

    :param X: sequences to encode; passed unchanged to utils.processMono.
    :param seqType: 'DNA', 'RNA' or 'PROT'. Any other value is rejected.
    :param args: run-time options, forwarded to utils.processMono.
    :return: None. The encoded data is handed to save.datasetSave under the
             tag 'bpf' with one feature per alphabet letter (4 or 20).
    '''
    # Column i of the encoding belongs to the i-th letter of the alphabet.
    if seqType == 'DNA':
        alphabet = 'ACGT'
    elif seqType == 'RNA':
        alphabet = 'ACGU'
    elif seqType == 'PROT':
        alphabet = 'ACDEFGHIKLMNPQRSTVWY'
    else:
        # No alphabet for an unknown sequence type. Previously this fell
        # through and crashed on an unbound lookup table; bail out instead.
        return None

    width = len(alphabet)
    d = {
        letter: [1 if column == row else 0 for column in range(width)]
        for row, letter in enumerate(alphabet)
    }
    d['p'] = [0] * width  # padding

    X = utils.processMono(X, d, args)

    save.datasetSave(X, width, 'bpf')
def generate(X, seqType, args):
    '''
    Encode RNA dinucleotides with 22 physicochemical values ("pcpR1").

    # Reference: repRNA

    :param X: sequences to encode; passed unchanged to utils.processDi.
    :param seqType: only 'RNA' is encoded. 'PROT'/'DNA' print an error and
                    are rejected; any other value is rejected silently.
    :param args: run-time options, forwarded to utils.processDi.
    :return: None. The encoded data is handed to save.datasetSave under the
             tag 'pcpR1' with 22 features per position.
    '''
    if seqType == 'PROT' or seqType == 'DNA':
        print(CRED + 'Error: The \'Physicochemical Properties-R1\' feature is NOT applicable for PROT and DNA.' + CEND)
        return None
    if seqType != 'RNA':
        # No table for an unknown sequence type. Previously this fell through
        # and crashed on an unbound lookup table; bail out instead.
        return None

    totalFeature = 22

    d = {
        'AA': [2.000, 0.000, -6.600, -6.820, -18.400, -19.000, -0.900, -0.930, 0.000, 0.000, 0.023,
               0.040, 0.000, 2.000, 3.180, 7.000, -0.080, -1.270, -13.700, 0.000, -0.800, 31.000],
        'AC': [1.000, 1.000, -10.200, -11.400, -26.200, -29.500, -2.100, -2.240, 1.000, 0.000, 0.083,
               0.140, 0.000, 1.000, 3.240, 4.800, 0.230, -1.430, -13.800, 0.000, 0.800, 32.000],
        'AG': [1.000, 0.000, -7.600, -10.480, -19.200, -27.100, -1.700, -2.080, 1.000, 1.000, 0.035,
               0.080, 0.000, 2.000, 3.300, 8.500, -0.040, -1.500, -14.000, 0.000, 0.500, 30.000],
        'AU': [1.000, 0.000, -5.700, -9.380, -15.500, -26.700, -0.900, -1.100, 0.000, 0.000, 0.090,
               0.140, 1.000, 1.000, 3.240, 7.100, -0.060, -1.360, -15.400, 1.000, 1.100, 33.000],
        'CA': [1.000, 1.000, -10.500, -10.440, -27.800, -26.900, -1.800, -2.110, 1.000, 0.000, 0.118,
               0.210, 0.000, 1.000, 3.090, 9.900, 0.110, -1.460, -14.400, 0.000, 1.000, 31.000],
        'CC': [0.000, 2.000, -12.200, -13.390, -29.700, -32.700, -2.900, -3.260, 2.000, 0.000, 0.349,
               0.490, 0.000, 0.000, 3.320, 8.700, -0.010, -1.780, -11.100, 0.000, 0.300, 32.000],
        'CG': [0.000, 1.000, -8.000, -10.640, -19.400, -26.700, -2.000, -2.360, 2.000, 1.000, 0.193,
               0.350, 1.000, 1.000, 3.300, 12.100, 0.300, -1.890, -15.600, 0.000, -0.100, 27.000],
        'CU': [0.000, 1.000, -7.600, -10.480, -19.200, -27.100, -1.700, -2.080, 1.000, 0.000, 0.378,
               0.520, 1.000, 0.000, 3.300, 8.500, -0.040, -1.500, -14.000, 1.000, 0.500, 30.000],
        'GA': [1.000, 0.000, -13.300, -12.440, -35.500, -32.500, -2.300, -2.350, 1.000, 1.000, 0.048,
               0.100, 1.000, 2.000, 3.380, 9.400, 0.070, -1.700, -14.200, 0.000, 1.300, 32.000],
        'GC': [0.000, 1.000, -14.200, -14.880, -34.900, -36.900, -3.400, -3.420, 2.000, 1.000, 0.146,
               0.260, 1.000, 1.000, 3.220, 6.100, 0.070, -1.390, -16.900, 0.000, 0.000, 35.000],
        'GG': [0.000, 0.000, -12.200, -13.390, -29.700, -32.700, -2.900, -3.260, 2.000, 2.000, 0.065,
               0.170, 2.000, 2.000, 3.320, 12.100, -0.010, -1.780, -11.100, 0.000, 0.300, 32.000],
        'GU': [0.000, 0.000, -10.200, -11.400, -26.200, -29.500, -2.100, -2.240, 1.000, 1.000, 0.160,
               0.270, 2.000, 1.000, 3.240, 4.800, 0.230, -1.430, -13.800, 1.000, 0.800, 32.000],
        'UA': [1.000, 0.000, -8.100, -7.690, -22.600, -20.500, -1.100, -1.330, 0.000, 0.000, 0.112,
               0.210, 1.000, 1.000, 3.260, 10.700, -0.020, -1.450, -16.000, 1.000, -0.200, 32.000],
        'UC': [0.000, 1.000, -10.200, -12.440, -26.200, -32.500, -2.100, -2.350, 1.000, 0.000, 0.359,
               0.480, 1.000, 0.000, 3.380, 9.400, 0.070, -1.700, -14.200, 1.000, 1.300, 32.000],
        'UG': [0.000, 0.000, -7.600, -10.440, -19.200, -26.900, -1.700, -2.110, 1.000, 1.000, 0.224,
               0.340, 1.000, 1.000, 3.090, 9.900, 0.110, -1.460, -14.400, 1.000, 1.000, 31.000],
        'UU': [0.000, 0.000, -6.600, -6.820, -18.400, -19.000, -0.900, -0.930, 0.000, 0.000, 0.389,
               0.440, 2.000, 0.000, 3.180, 7.000, -0.080, -1.270, -13.700, 2.000, -0.800, 31.000],
        # Padding must be exactly as wide as the property rows; a longer row
        # would make padded sequences ragged and disagree with totalFeature.
        'p': [0] * totalFeature,
    }

    X = utils.processDi(X, d, args)

    save.datasetSave(X, totalFeature, 'pcpR1')
def generate(X, seqType, args):
    '''
    Position-wise one-hot encoding of k-mers ("pkmer").

    Each k-mer returned by utils.kmers(x, args.kTuple) becomes a one-hot row
    over all kTuple-length words built from the sequence elements. Sequences
    shorter than args.terminusLength are topped up with all-zero rows.

    :param X: iterable of sequences; each one is cut to args.terminusLength.
    :param seqType: 'DNA', 'RNA' or 'PROT'; forwarded to utils.sequenceElements.
    :param args: options object; kTuple and terminusLength are read here.
    :return: None. The encoded data is handed to save.datasetSave under the
             tag 'pkmer'.
    '''
    k = args.kTuple

    # All-zero row used to pad short sequences (one slot per possible word).
    if seqType == 'DNA' or seqType == 'RNA':
        p = [0] * (4**k)
    elif seqType == 'PROT':
        p = [0] * (20**k)
    # Any other seqType leaves the padding row undefined.

    elements = utils.sequenceElements(seqType)
    words = [''.join(letters) for letters in itertools.product(elements, repeat=k)]
    limit = args.terminusLength

    T = []
    for x in X:
        x = x[:limit]
        # (limit - k + 1) - (len(x) - k + 1) reduces to the length shortfall.
        shortfall = limit - len(x)

        rows = []
        for kmer in utils.kmers(x, k):
            flags = dict.fromkeys(words, 0)
            flags[kmer] = 1
            rows.append(list(flags.values()))

        if shortfall > 0:
            rows.extend([p] * shortfall)

        T.append(np.array(rows))

    T = np.array(T)

    if seqType == 'DNA' or seqType == 'RNA':
        totalFeature = (4**k)
    elif seqType == 'PROT':
        totalFeature = (20**k)
    else:
        totalFeature = 0

    save.datasetSave(T, totalFeature, 'pkmer')
def generate(X, seqType, args):
    '''
    Encode protein residues with 10 binary property flags ("pcpP1").

    # Reference: https://doi.org/10.1093/bioinformatics/bty451 (Supp: Table S2)

    :param X: sequences to encode; passed unchanged to utils.processMono.
    :param seqType: only 'PROT' is encoded. 'DNA'/'RNA' print an error;
                    every non-'PROT' value makes the function return early.
    :param args: run-time options, forwarded to utils.processMono.
    :return: None. The encoded data is handed to save.datasetSave under the
             tag 'pcpP1' with 10 features per position.
    '''
    if seqType != 'PROT':
        if seqType in ('DNA', 'RNA'):
            print(CRED+'Error: The \'Physicochemical Properties-P1\' feature is NOT applicable for DNA/RNA.'+CEND)
        return None

    # One column per property; a residue may belong to several of them.
    properties = (
        'FYWH',           # Column1  --> Aromatic
        'DE',             # Column2  --> Negative
        'KHR',            # Column3  --> Positive
        'NQSDECTKRHYW',   # Column4  --> Polar
        'AGCTIVLKHFYWM',  # Column5  --> Hydrophobic
        'IVL',            # Column6  --> Aliphatic
        'ASGC',           # Column7  --> Tiny
        'KHRDE',          # Column8  --> Charged
        'PNDTCAGSV',      # Column9  --> Small
        'P',              # Column10 --> Proline
    )

    table = {
        residue: [int(residue in members) for members in properties]
        for residue in 'ARNDCQEGHILKMFPSTWYV'
    }
    table['p'] = [0] * len(properties)  # padding

    X = utils.processMono(X, table, args)

    # Only 'PROT' reaches this point; there are 10 properties.
    save.datasetSave(X, 10, 'pcpP1')
def generate(X, seqType, args):
    '''
    Encode protein residues with their 20-value PAM250 row.

    # Reference-1: http://rosalind.info/glossary/pam250/
    # Reference-2: http://profs.scienze.univr.it/~liptak/ALBioinfo/2011_2012/files/pam1.pdf (M.O. Dayhoff et al.)

    :param X: sequences to encode; passed unchanged to utils.processMono.
    :param seqType: only 'PROT' is encoded. 'DNA'/'RNA' print an error;
                    every non-'PROT' value makes the function return early.
    :param args: run-time options, forwarded to utils.processMono.
    :return: None. The encoded data is handed to save.datasetSave under the
             tag 'pam250' with 20 features per position.
    '''
    if seqType != 'PROT':
        if seqType in ('DNA', 'RNA'):
            print(CRED + 'Error: The \'PAM250\' feature is NOT applicable for DNA/RNA.' + CEND)
        return None

    # Rows and columns share this residue order.
    residues = 'ACDEFGHIKLMNPQRSTVWY'
    matrix = [
        [2, -2, 0, 0, -3, 1, -1, -1, -1, -2, -1, 0, 1, 0, -2, 1, 1, 0, -6, -3],            # A
        [-2, 12, -5, -5, -4, -3, -3, -2, -5, -6, -5, -4, -3, -5, -4, 0, -2, -2, -8, 0],    # C
        [0, -5, 4, 3, -6, 1, 1, -2, 0, -4, -3, 2, -1, 2, -1, 0, 0, -2, -7, -4],            # D
        [0, -5, 3, 4, -5, 0, 1, -2, 0, -3, -2, 1, -1, 2, -1, 0, 0, -2, -7, -4],            # E
        [-3, -4, -6, -5, 9, -5, -2, 1, -5, 2, 0, -3, -5, -5, -4, -3, -3, -1, 0, 7],        # F
        [1, -3, 1, 0, -5, 5, -2, -3, -2, -4, -3, 0, 0, -1, -3, 1, 0, -1, -7, -5],          # G
        [-1, -3, 1, 1, -2, -2, 6, -2, 0, -2, -2, 2, 0, 3, 2, -1, -1, -2, -3, 0],           # H
        [-1, -2, -2, -2, 1, -3, -2, 5, -2, 2, 2, -2, -2, -2, -2, -1, 0, 4, -5, -1],        # I
        [-1, -5, 0, 0, -5, -2, 0, -2, 5, -3, 0, 1, -1, 1, 3, 0, 0, -2, -3, -4],            # K
        [-2, -6, -4, -3, 2, -4, -2, 2, -3, 6, 4, -3, -3, -2, -3, -3, -2, 2, -2, -1],       # L
        [-1, -5, -3, -2, 0, -3, -2, 2, 0, 4, 6, -2, -2, -1, 0, -2, -1, 2, -4, -2],         # M
        [0, -4, 2, 1, -3, 0, 2, -2, 1, -3, -2, 2, 0, 1, 0, 1, 0, -2, -4, -2],              # N
        [1, -3, -1, -1, -5, 0, 0, -2, -1, -3, -2, 0, 6, 0, 0, 1, 0, -1, -6, -5],           # P
        [0, -5, 2, 2, -5, -1, 3, -2, 1, -2, -1, 1, 0, 4, 1, -1, -1, -2, -5, -4],           # Q
        [-2, -4, -1, -1, -4, -3, 2, -2, 3, -3, 0, 0, 0, 1, 6, 0, -1, -2, 2, -4],           # R
        [1, 0, 0, 0, -3, 1, -1, -1, 0, -3, -2, 1, 1, -1, 0, 2, 1, -1, -2, -3],             # S
        [1, -2, 0, 0, -3, 0, -1, 0, 0, -2, -1, 0, 0, -1, -1, 1, 3, 0, -5, -3],             # T
        [0, -2, -2, -2, -1, -1, -2, 4, -2, 2, 2, -2, -1, -2, -2, -1, 0, 4, -6, -2],        # V
        [-6, -8, -7, -7, 0, -7, -3, -5, -3, -2, -4, -4, -6, -5, 2, -2, -5, -6, 17, 0],     # W
        [-3, 0, -4, -4, 7, -5, 0, -1, -4, -1, -2, -2, -5, -4, -4, -3, -3, -2, 0, 10],      # Y
    ]
    table = dict(zip(residues, matrix))
    table['p'] = [0] * 20  # padding

    X = utils.processMono(X, table, args)

    # Only 'PROT' reaches this point, and each row holds 20 scores.
    save.datasetSave(X, 20, 'pam250')
def generate(X, seqType, args):
    '''
    Encode protein residues with 21 binary flags ("pcpP2").

    # Reference: https://doi.org/10.1093/bioinformatics/bty451 (Supp: Table S3)

    Seven properties each split the residues into three groups. The row of a
    residue is seven consecutive 3-value one-hot triplets, one per property:
    columns 1-3 are hydrophobicity groups 1/2/3, columns 4-6 are the
    normalized Van der Waals volume groups, and so on in the order listed in
    the code below.

    :param X: sequences to encode; passed unchanged to utils.processMono.
    :param seqType: only 'PROT' is encoded. 'DNA'/'RNA' print an error;
                    every non-'PROT' value makes the function return early.
    :param args: run-time options, forwarded to utils.processMono.
    :return: None. The encoded data is handed to save.datasetSave under the
             tag 'pcpP2' with 21 features per position.
    '''
    if seqType != 'PROT':
        if seqType in ('DNA', 'RNA'):
            print(CRED + 'Error: The \'Physicochemical Properties-P2\' feature is NOT applicable for DNA/RNA.' + CEND)
        return None

    # (group-1, group-2, group-3) membership for each property.
    partitions = (
        ('ACFGHILMNPQSTVWY', 'DE', 'KR'),      # Hydrophobicity
        ('CFILMVW', 'AGHPSTY', 'DEKNQR'),      # Normalized Van der Waals volume
        ('ACDGPST', 'EILNQV', 'FHKMRWY'),      # Polarity
        ('CFILMVWY', 'AGPST', 'DEHKNQR'),      # Polarizibility
        ('ADGST', 'CEILNPQV', 'FHKMRWY'),      # Charge
        ('DGNPS', 'AEHKLMQR', 'CFITVWY'),      # Secondary structures
        ('ACFGILVW', 'HMPSTY', 'DEKNRQ'),      # Solvent accessibility
    )

    table = {}
    for residue in 'ARNDCQEGHILKMFPSTWYV':
        flags = []
        for groups in partitions:
            flags.extend(int(residue in members) for members in groups)
        table[residue] = flags
    table['p'] = [0] * 21  # padding

    X = utils.processMono(X, table, args)

    # Only 'PROT' reaches this point; 7 properties x 3 groups = 21 columns.
    save.datasetSave(X, 21, 'pcpP2')
def generate(X, seqType, args):
    '''
    Encode DNA trinucleotides with 12 physicochemical values ("pcpD2").

    # Reference: repRNA

    :param X: sequences to encode; passed unchanged to utils.processTri.
    :param seqType: only 'DNA' is encoded. 'PROT'/'RNA' print an error and
                    are rejected; any other value is rejected silently.
    :param args: run-time options, forwarded to utils.processTri.
    :return: None. The encoded data is handed to save.datasetSave under the
             tag 'pcpD2' with 12 features per position.
    '''
    if seqType == 'PROT' or seqType == 'RNA':
        print(CRED + 'Error: The \'Physicochemical Properties-D2\' feature is NOT applicable for PROT and RNA.' + CEND)
        return None
    if seqType != 'DNA':
        # No table for an unknown sequence type. Previously this fell through
        # and crashed on an unbound lookup table; bail out instead.
        return None

    totalFeature = 12

    # NOTE(review): column 7 of 'TGA' (10.0000) and 'TTT' (0.1000) differs
    # from the reverse complements 'TCA' (2.1970) and 'AAA' (6.8820), while
    # other reverse-complement pairs (e.g. GGG/CCC, GGA/TCC) have identical
    # rows. Values are kept as found; confirm against the source table.
    d = {
        'GGG': [5.7000, 5.8500, 3.0000, 13.0000, 5.8270, 5.8270, 3.3110, 3.8680, 622.4000, 103.3887, 6.0000, 3.5360],
        'GGA': [6.2000, 5.0000, 2.0000, -5.0000, 4.9907, 4.9907, 3.8190, 3.5810, 622.4000, 103.3887, 3.8000, 4.7990],
        'GGC': [8.2000, 9.1000, 3.0000, 45.0000, 9.0823, 9.0823, 1.3870, 2.4480, 622.4000, 103.3887, 10.0000, 1.3090],
        'GGT': [5.2000, 5.3000, 2.0000, 8.0000, 5.3160, 5.3160, 3.6190, 4.1560, 622.4000, 103.3887, 5.4000, 3.8780],
        'GAG': [6.6000, 6.0000, 2.0000, 8.0000, 5.9806, 5.9806, 3.2210, 3.3530, 621.4000, 103.2226, 5.4000, 3.8780],
        'GAA': [5.1000, 4.0500, 1.0000, -12.0000, 4.0633, 4.0633, 4.3850, 4.2140, 621.4000, 103.2226, 3.0000, 5.2640],
        'GAC': [5.6000, 5.5000, 2.0000, 8.0000, 5.5164, 5.5164, 3.4980, 3.9250, 621.4000, 103.2226, 5.4000, 3.8780],
        'GAT': [3.6000, 4.4500, 1.0000, 7.0000, 4.4432, 4.4432, 4.1530, 5.0870, 621.4000, 103.2226, 5.3000, 3.9350],
        'GCG': [4.3000, 5.9000, 3.0000, 25.0000, 5.8914, 5.8914, 3.2750, 4.6780, 622.4000, 103.3887, 7.5000, 2.6910],
        'GCA': [7.5000, 6.7500, 2.0000, 13.0000, 6.7553, 6.7553, 2.7540, 2.8420, 622.4000, 103.3887, 6.0000, 3.5360],
        'GCC': [8.2000, 9.1000, 3.0000, 45.0000, 9.0823, 9.0823, 1.3870, 2.4480, 622.4000, 103.3887, 10.0000, 1.3090],
        'GCT': [6.3000, 6.9000, 2.0000, 25.0000, 6.8829, 6.8829, 2.6830, 3.5240, 622.4000, 103.3887, 7.5000, 2.6910],
        'GTG': [6.8000, 6.6500, 2.0000, 17.0000, 6.6255, 6.6255, 2.8320, 3.2390, 621.4000, 103.2226, 6.5000, 3.2530],
        'GTA': [6.4000, 5.0500, 1.0000, -6.0000, 5.0673, 5.0673, 3.7700, 3.4670, 621.4000, 103.2226, 3.7000, 4.8570],
        'GTC': [5.6000, 5.5000, 2.0000, 8.0000, 5.5164, 5.5164, 3.4980, 3.9250, 621.4000, 103.2226, 5.4000, 3.8780],
        'GTT': [1.6000, 2.6500, 1.0000, -6.0000, 2.6412, 2.6412, 5.2600, 6.2720, 621.4000, 103.2226, 3.7000, 4.8570],
        'AGG': [4.7000, 5.0500, 2.0000, 8.0000, 5.0523, 5.0523, 3.7820, 4.4450, 622.4000, 103.3887, 5.4000, 3.8780],
        'AGA': [6.5000, 4.9000, 1.0000, -9.0000, 4.8884, 4.8884, 3.8790, 3.4100, 622.4000, 103.3887, 3.3000, 5.0890],
        'AGC': [6.3000, 6.9000, 2.0000, 25.0000, 6.8829, 6.8829, 2.6830, 3.5240, 622.4000, 103.3887, 7.5000, 2.6910],
        'AGT': [2.0000, 3.9000, 1.0000, 11.0000, 3.9232, 3.9232, 4.4710, 6.0330, 622.4000, 103.3887, 5.8000, 3.6500],
        'AAG': [4.2000, 4.7000, 1.0000, 6.0000, 4.6992, 4.6992, 3.9950, 4.7360, 621.4000, 103.2226, 5.2000, 3.9920],
        'AAA': [0.1000, 0.0500, 0.0000, -36.0000, 0.0633, 0.0633, 6.8820, 7.1760, 621.4000, 103.2226, 0.0000, 7.0450],
        'AAC': [1.6000, 2.6500, 1.0000, -6.0000, 2.6412, 2.6412, 5.2600, 6.2720, 621.4000, 103.2226, 3.7000, 4.8570],
        'AAT': [0.0000, 0.3500, 0.0000, -30.0000, 0.3500, 0.3500, 6.6980, 7.2370, 621.4000, 103.2226, 0.7000, 6.6240],
        'ACG': [5.2000, 5.3000, 2.0000, 8.0000, 5.3055, 5.3055, 3.6250, 4.1560, 622.4000, 103.3887, 5.4000, 3.8780],
        'ACA': [5.8000, 5.5000, 1.0000, 6.0000, 5.4903, 5.4903, 3.5160, 3.8100, 622.4000, 103.3887, 5.2000, 3.9920],
        'ACC': [5.2000, 5.3000, 2.0000, 8.0000, 5.3160, 5.3160, 3.6190, 4.1560, 622.4000, 103.3887, 5.4000, 3.8780],
        'ACT': [2.0000, 3.9000, 1.0000, 11.0000, 3.9232, 3.9232, 4.4710, 6.0330, 622.4000, 103.3887, 5.8000, 3.6500],
        'ATG': [8.7000, 7.7000, 1.0000, 18.0000, 7.7171, 7.7171, 2.1850, 2.1690, 621.4000, 103.2226, 6.7000, 3.1400],
        'ATA': [9.7000, 6.2500, 0.0000, -13.0000, 6.2734, 6.2734, 3.0470, 1.6130, 621.4000, 103.2226, 2.8000, 5.3810],
        'ATC': [3.6000, 4.4500, 1.0000, 7.0000, 4.4432, 4.4432, 4.1530, 5.0870, 621.4000, 103.2226, 5.3000, 3.9350],
        'ATT': [0.0000, 0.3500, 0.0000, -30.0000, 0.3500, 0.3500, 6.6980, 7.2370, 621.4000, 103.2226, 0.7000, 6.6240],
        'CGG': [3.0000, 3.8500, 3.0000, 2.0000, 3.8690, 3.8690, 4.5020, 5.4400, 622.4000, 103.3887, 4.7000, 4.2790],
        'CGA': [5.8000, 7.0500, 2.0000, 31.0000, 7.0720, 7.0720, 2.5700, 3.8100, 622.4000, 103.3887, 8.3000, 2.2450],
        'CGC': [4.3000, 5.9000, 3.0000, 25.0000, 5.8914, 5.8914, 3.2750, 4.6780, 622.4000, 103.3887, 7.5000, 2.6910],
        'CGT': [5.2000, 5.3000, 2.0000, 8.0000, 5.3055, 5.3055, 3.6250, 4.1560, 622.4000, 103.3887, 5.4000, 3.8780],
        'CAG': [9.6000, 6.9000, 2.0000, -2.0000, 6.8996, 6.8996, 2.6710, 1.6680, 621.4000, 103.2226, 4.2000, 4.5670],
        'CAA': [6.2000, 4.7500, 1.0000, -9.0000, 4.7618, 4.7618, 3.9580, 3.5810, 621.4000, 103.2226, 3.3000, 5.0890],
        'CAC': [6.8000, 6.6500, 2.0000, 17.0000, 6.6255, 6.6255, 2.8320, 3.2390, 621.4000, 103.2226, 6.5000, 3.2530],
        'CAT': [8.7000, 7.7000, 1.0000, 18.0000, 7.7171, 7.7171, 2.1850, 2.1690, 621.4000, 103.2226, 6.7000, 3.1400],
        'CCG': [3.0000, 3.8500, 3.0000, 2.0000, 3.8690, 3.8690, 4.5020, 5.4400, 622.4000, 103.3887, 4.7000, 4.2790],
        'CCA': [0.7000, 3.0500, 2.0000, 8.0000, 3.0587, 3.0587, 5.0000, 6.8130, 622.4000, 103.3887, 5.4000, 3.8780],
        'CCC': [5.7000, 5.8500, 3.0000, 13.0000, 5.8270, 5.8270, 3.3110, 3.8680, 622.4000, 103.3887, 6.0000, 3.5360],
        'CCT': [4.7000, 5.0500, 2.0000, 8.0000, 5.0523, 5.0523, 3.7820, 4.4450, 622.4000, 103.3887, 5.4000, 3.8780],
        'CTG': [9.6000, 6.9000, 2.0000, -2.0000, 6.8996, 6.8996, 2.6710, 1.6680, 621.4000, 103.2226, 4.2000, 4.5670],
        'CTA': [7.8000, 5.0000, 1.0000, -18.0000, 5.0030, 5.0030, 3.8130, 2.6730, 621.4000, 103.2226, 2.2000, 5.7340],
        'CTC': [6.6000, 6.0000, 2.0000, 8.0000, 5.9806, 5.9806, 3.2210, 3.3530, 621.4000, 103.2226, 5.4000, 3.8780],
        'CTT': [4.2000, 4.7000, 1.0000, 6.0000, 4.6992, 4.6992, 3.9950, 4.7360, 621.4000, 103.2226, 5.2000, 3.9920],
        'TGG': [0.7000, 3.0500, 2.0000, 8.0000, 3.0587, 3.0587, 5.0000, 6.8130, 622.4000, 103.3887, 5.4000, 3.8780],
        'TGA': [10.0000, 7.7000, 1.0000, 8.0000, 7.7000, 7.7000, 10.0000, 1.4470, 622.4000, 103.3887, 5.4000, 3.8780],
        'TGC': [7.5000, 6.7500, 2.0000, 13.0000, 6.7553, 6.7553, 2.7540, 2.8420, 622.4000, 103.3887, 6.0000, 3.5360],
        'TGT': [5.8000, 5.5000, 1.0000, 6.0000, 5.4903, 5.4903, 3.5160, 3.8100, 622.4000, 103.3887, 5.2000, 3.9920],
        'TAG': [7.8000, 5.0000, 1.0000, -18.0000, 5.0030, 5.0030, 3.8130, 2.6730, 621.4000, 103.2226, 2.2000, 5.7340],
        'TAA': [7.3000, 4.6500, 0.0000, -20.0000, 4.6709, 4.6709, 4.0130, 2.9550, 621.4000, 103.2226, 2.0000, 5.8520],
        'TAC': [6.4000, 5.0500, 1.0000, -6.0000, 5.0673, 5.0673, 3.7700, 3.4670, 621.4000, 103.2226, 3.7000, 4.8570],
        'TAT': [9.7000, 6.2500, 0.0000, -13.0000, 6.2734, 6.2734, 3.0470, 1.6130, 621.4000, 103.2226, 2.8000, 5.3810],
        'TCG': [5.8000, 7.0500, 2.0000, 31.0000, 7.0720, 7.0720, 2.5700, 3.8100, 622.4000, 103.3887, 8.3000, 2.2450],
        'TCA': [10.0000, 7.7000, 1.0000, 8.0000, 7.7000, 7.7000, 2.1970, 1.4470, 622.4000, 103.3887, 5.4000, 3.8780],
        'TCC': [6.2000, 5.0000, 2.0000, -5.0000, 4.9907, 4.9907, 3.8190, 3.5810, 622.4000, 103.3887, 3.8000, 4.7990],
        'TCT': [6.5000, 4.9000, 1.0000, -9.0000, 4.8884, 4.8884, 3.8790, 3.4100, 622.4000, 103.3887, 3.3000, 5.0890],
        'TTG': [6.2000, 4.7500, 1.0000, -9.0000, 4.7618, 4.7618, 3.9580, 3.5810, 621.4000, 103.2226, 3.3000, 5.0890],
        'TTA': [7.3000, 4.6500, 0.0000, -20.0000, 4.6709, 4.6709, 4.0130, 2.9550, 621.4000, 103.2226, 2.0000, 5.8520],
        'TTC': [5.1000, 4.0500, 1.0000, -12.0000, 4.0633, 4.0633, 4.3850, 4.2140, 621.4000, 103.2226, 3.0000, 5.2640],
        'TTT': [0.1000, 0.0500, 0.0000, -36.0000, 0.0633, 0.0633, 0.1000, 7.1760, 621.4000, 103.2226, 0.0000, 7.0450],
        'p': [0] * totalFeature,  # padding
    }

    X = utils.processTri(X, d, args)

    save.datasetSave(X, totalFeature, 'pcpD2')