def BINARY(fastas, **kw):
    """One-hot encode equal-length sequences over a 21-letter alphabet.

    Every residue is expanded into 21 binary flags, one per letter of
    'ARNDCQEGHILKMFPSTWYVX'. A gap character ('-') is expanded into 21
    zeros; a residue that is not in the alphabet also yields 21 zeros
    because no flag matches.

    Args:
        fastas: sequence of records; record[0] is the name and record[1]
            the sequence string.
        **kw: accepted for signature parity with the other encoders; unused.

    Returns:
        A list of rows. The first row is the header ('#', 'BINARY.F1', ...);
        each following row is [name, flag, flag, ...]. Returns 0 after
        printing an error when checkFasta.checkFasta(fastas) is False.
    """
    if checkFasta.checkFasta(fastas) == False:
        print(
            'Error: for "BINARY" encoding, the input fasta sequences should be with equal length. \n\n'
        )
        return 0

    AA = 'ARNDCQEGHILKMFPSTWYVX'
    gap_block = [0] * len(AA)

    encodings = []
    header = ['#']
    for i in range(1, len(fastas[0][1]) * 21 + 1):
        header.append('BINARY.F' + str(i))
    encodings.append(header)

    for record in fastas:
        name, sequence = record[0], record[1]
        code = [name]
        for aa in sequence:
            if aa == '-':
                code.extend(gap_block)
                # Move on to the next residue. Stopping here would drop
                # everything after the first gap and leave the row shorter
                # than the header.
                continue
            code.extend(1 if aa == letter else 0 for letter in AA)
        encodings.append(code)
    return encodings
def PSSM(fastas, **kw):
    """Encode each sequence with the score rows of its PSSM profile file.

    For every record the file '<path>/<name>.pssm' is read, the peptide is
    located inside the protein sequence listed in that file, and the 20
    score columns of each matching row are appended to the output row.
    Scores are kept as the string tokens read from the file.

    Args:
        fastas: sequence of records; record[0] is the name and record[1]
            the sequence string.
        **kw: must provide 'path', the directory holding the .pssm files.

    Returns:
        A list of rows. The first row is the header ('#', 'Pos.1.A', ...);
        each following row is [name, score, ...]. A record whose .pssm file
        is missing is reported and produces no row. A record whose peptide
        is not found in the profile produces a row holding only the name.
        Returns 0 after printing an error when the sequences fail
        checkFasta.checkFasta or when no path is given.
    """
    if checkFasta.checkFasta(fastas) == False:
        print(
            'Error: for "PSSM" encoding, the input fasta sequences should be with equal length. \n\n'
        )
        return 0

    # .get() so that an omitted 'path' is reported like an explicit None
    # instead of raising KeyError.
    pssmDir = kw.get('path')
    if pssmDir is None:
        print(
            'Error: please specify the directory of PSSM profile files by "--path" \n\n'
        )
        return 0

    AA = 'ARNDCQEGHILKMFPSTWYV'
    encodings = []
    header = ['#']
    for p in range(1, len(fastas[0][1]) + 1):
        for aa in AA:
            header.append('Pos.' + str(p) + '.' + aa)
    encodings.append(header)

    for record in fastas:
        name, sequence = record[0], record[1]
        code = [name]
        pssm_path = pssmDir + '/' + name + '.pssm'
        if not os.path.exists(pssm_path):
            print('Error: pssm prfile for protein ' + name + ' does not exist.')
            # Best effort: skip this record rather than aborting the run.
            continue

        with open(pssm_path) as f:
            # Only the file lines at index 3..28 are parsed.
            # NOTE(review): the fixed upper bound caps the profile at 26
            # rows - confirm this matches the expected input files.
            records = f.readlines()[3:29]

        proteinSeq = ''
        pssmMatrix = []
        for line in records:
            if len(line) == 1:
                print("got u")
                continue
            array = line.strip().split()
            # Token 1 is the residue letter; tokens 2..21 are the scores.
            pssmMatrix.append(array[2:22])
            proteinSeq = proteinSeq + array[1]

        pos = proteinSeq.find(sequence)
        if pos == -1:
            print('Warning: could not find the peptide in proteins.\n\n')
        else:
            for p in range(pos, pos + len(sequence)):
                code.extend(pssmMatrix[p])
        encodings.append(code)
    return encodings
def DisorderB(fastas, **kw):
    """Encode each residue's predicted disorder state as a two-value flag.

    For every record the file '<path>/<name>.dis' is read. Lines after the
    first line starting with '-------' (and before the file's last line)
    are parsed: token 1 is the residue letter and token 3 is the state.
    State 'D' is encoded as [0, 1]; any other state as [1, 0].

    Args:
        fastas: sequence of records; record[0] is the name and record[1]
            the sequence string.
        **kw: must provide 'path', the directory holding the .dis files.

    Returns:
        A list of rows. The first row is the header ('#', 'disorderB.F1',
        ...); each following row is [name, flag, flag, ...]. A record whose
        peptide is not found in its .dis file produces a row holding only
        the name. Returns 0 after printing an error when the sequences fail
        checkFasta.checkFasta, when no path is given, or when a .dis file
        is missing.
    """
    if checkFasta.checkFasta(fastas) == False:
        print(
            'Error: for "DisorderB" encoding, the input fasta sequences should be with equal length. \n\n'
        )
        return 0

    # .get() so that an omitted 'path' is reported like an explicit None
    # instead of raising KeyError.
    disDir = kw.get('path')
    if disDir is None:
        print(
            'Error: please specify the directory of predicted protein disorder files by "--path"'
        )
        return 0

    encodings = []
    header = ['#']
    for p in range(1, 2 * len(fastas[0][1]) + 1):
        header.append('disorderB.F' + str(p))
    encodings.append(header)

    myDict = {'D': [0, 1], 'O': [1, 0]}

    for record in fastas:
        name, sequence = record[0], record[1]
        code = [name]
        dis_path = disDir + '/' + name + '.dis'
        if not os.path.exists(dis_path):
            print(
                'Error: the predicted disorder information file (.dis) for protein '
                + name + ' does not exist.')
            return 0

        with open(dis_path) as f:
            records = f.readlines()

        # Locate the separator line; data rows start right after it.
        tag = 0
        for line_no, line in enumerate(records):
            if re.search('^-------', line):
                tag = line_no
                break
        records = records[tag + 1:-1]

        proteinSeq = ''
        disValue = []
        for line in records:
            array = line.split()
            if not array:
                # A blank line carries no residue; indexing it would crash.
                continue
            proteinSeq = proteinSeq + array[1]
            disValue.append('D' if array[3] == 'D' else 'O')

        pos = proteinSeq.find(sequence)
        if pos == -1:
            print('Warning: could not find the peptide in proteins.\n\n')
        else:
            for p in range(pos, pos + len(sequence)):
                code.extend(myDict[disValue[p]])
        encodings.append(code)
    return encodings
def AAINDEX(fastas, **kw):
    """Encode each residue with the values of every index in the AAindex file.

    The index table is read from a data file located relative to this
    module's directory (with a trailing 'codes' removed from that
    directory). The first line of the file is skipped; on every other line
    the first token is the index name and the remaining tokens are its
    per-residue values, looked up by the residue's position in
    'ARNDCQEGHILKMFPSTWXYV'. Values are kept as the string tokens read from
    the file. A gap ('-') contributes one 0 per index.

    Args:
        fastas: sequence of records; record[0] is the name and record[1]
            the sequence string.
        **kw: accepted for signature parity with the other encoders; unused.

    Returns:
        A list of rows. The first row is the header ('#',
        'SeqPos.<pos>.<indexName>', ...); each following row is
        [name, value, ...]. Returns 0 after printing an error when
        checkFasta.checkFasta(fastas) is False.

    Raises:
        KeyError: if a sequence contains a residue other than '-' that is
            not in the alphabet above.
    """
    if checkFasta.checkFasta(fastas) == False:
        print(
            'Error: for "AAINDEX" encoding, the input fasta sequences should be with equal length. \n\n'
        )
        return 0

    AA = 'ARNDCQEGHILKMFPSTWXYV'

    base_dir = re.sub('codes$', '',
                      os.path.split(os.path.realpath(__file__))[0])
    # NOTE(review): the two platforms load differently named files, while the
    # alphabet above has 21 letters (includes 'X'). Confirm that
    # 'AAindex.txt' has 21 value columns in this order.
    if platform.system() == 'Windows':
        fileAAindex = base_dir + r'\data\AAINDEXwithX.txt'
    else:
        fileAAindex = base_dir + '/data/AAindex.txt'

    with open(fileAAindex) as f:
        records = f.readlines()[1:]

    AAindex = []
    AAindexName = []
    for line in records:
        fields = line.split()
        if not fields:
            # Skip blank lines; a None entry would break both the header
            # construction and the per-residue lookup below.
            continue
        AAindexName.append(fields[0])
        AAindex.append(fields[1:])

    index = {aa: column for column, aa in enumerate(AA)}

    encodings = []
    header = ['#']
    for pos in range(1, len(fastas[0][1]) + 1):
        for idName in AAindexName:
            header.append('SeqPos.' + str(pos) + '.' + idName)
    encodings.append(header)

    for record in fastas:
        name, sequence = record[0], record[1]
        code = [name]
        for aa in sequence:
            if aa == '-':
                code.extend([0] * len(AAindex))
                continue
            column = index[aa]
            code.extend(values[column] for values in AAindex)
        encodings.append(code)
    return encodings
def TA(fastas, **kw):
    """Encode each residue with two values read from its '.spXout' file.

    For every record the file '<path>/<name>.spXout' is read and its first
    line skipped. On each remaining line token 1 is the residue letter and
    tokens 3 and 4 are emitted under the '.phi' and '.psi' header columns
    respectively. Values are kept as the string tokens read from the file.

    Args:
        fastas: sequence of records; record[0] is the name and record[1]
            the sequence string.
        **kw: must provide 'path', the directory holding the .spXout files.

    Returns:
        A list of rows. The first row is the header ('#', 'TA.F1.phi',
        'TA.F1.psi', ...); each following row is [name, phi, psi, ...]. A
        record whose peptide is not found in its file produces a row
        holding only the name. Returns 0 after printing an error when the
        sequences fail checkFasta.checkFasta, when no path is given, or
        when a .spXout file is missing.
    """
    if checkFasta.checkFasta(fastas) == False:
        print(
            'Error: for "TA" encoding, the input fasta sequences should be with equal length. \n\n'
        )
        return 0

    encodings = []
    header = ['#']
    for p in range(1, len(fastas[0][1]) + 1):
        header.append('TA.F' + str(p) + '.phi')
        header.append('TA.F' + str(p) + '.psi')
    encodings.append(header)

    # .get() so that an omitted 'path' is reported like an explicit None
    # instead of raising KeyError.
    disDir = kw.get('path')
    if disDir is None:
        print(
            'Error: please specify the directory of predicted protein TA file by "--path"'
        )
        return 0

    for record in fastas:
        name, sequence = record[0], record[1]
        code = [name]
        # Check the same file that is opened below. Testing for '.dis'
        # could pass or fail independently of the '.spXout' file.
        ta_path = disDir + '/' + name + '.spXout'
        if not os.path.exists(ta_path):
            print(
                'Error: the predicted TA information file (.spXout) for protein '
                + name + ' does not exist.')
            return 0

        with open(ta_path) as f:
            records = f.readlines()[1:]

        proteinSeq = ''
        asaValue = []
        for line in records:
            array = line.split()
            if not array:
                # A blank line carries no residue; indexing it would crash.
                continue
            proteinSeq = proteinSeq + array[1]
            asaValue.append(array[3:5])

        pos = proteinSeq.find(sequence)
        if pos == -1:
            print('Warning: could not find the peptide in proteins.\n\n')
        else:
            for p in range(pos, pos + len(sequence)):
                code.append(asaValue[p][0])
                code.append(asaValue[p][1])
        encodings.append(code)
    return encodings
def ZSCALE(fastas, **kw):
    """Encode each residue with its five z-scale descriptor values.

    Every character of a sequence is replaced by the five numbers stored
    for it in the lookup table below; the gap character '-' maps to five
    zeros.

    Args:
        fastas: sequence of records; record[0] is the name and record[1]
            the sequence string.
        **kw: accepted for signature parity with the other encoders; unused.

    Returns:
        A list of rows. The first row is the header ('#', 'Pos1.ZSCALE1',
        ...); each following row is [name, z1, z2, z3, z4, z5, ...].
        Returns 0 after printing an error when
        checkFasta.checkFasta(fastas) is False.

    Raises:
        KeyError: if a sequence contains a character missing from the table.
    """
    if checkFasta.checkFasta(fastas) == False:
        print(
            'Error: for "ZSCALE" encoding, the input fasta sequences should be with equal length. \n\n'
        )
        return 0

    zscale = {
        'A': [0.24, -2.32, 0.60, -0.14, 1.30],
        'C': [0.84, -1.67, 3.71, 0.18, -2.65],
        'D': [3.98, 0.93, 1.93, -2.46, 0.75],
        'E': [3.11, 0.26, -0.11, -0.34, -0.25],
        'F': [-4.22, 1.94, 1.06, 0.54, -0.62],
        'G': [2.05, -4.06, 0.36, -0.82, -0.38],
        'H': [2.47, 1.95, 0.26, 3.90, 0.09],
        'I': [-3.89, -1.73, -1.71, -0.84, 0.26],
        'K': [2.29, 0.89, -2.49, 1.49, 0.31],
        'L': [-4.28, -1.30, -1.49, -0.72, 0.84],
        'M': [-2.85, -0.22, 0.47, 1.94, -0.98],
        'N': [3.05, 1.62, 1.04, -1.15, 1.61],
        'P': [-1.66, 0.27, 1.84, 0.70, 2.00],
        'Q': [1.75, 0.50, -1.44, -1.34, 0.66],
        'R': [3.52, 2.50, -3.50, 1.99, -0.17],
        'S': [2.39, -1.07, 1.15, -1.39, 0.67],
        'T': [0.75, -2.18, -1.12, -1.46, -0.40],
        'V': [-2.59, -2.64, -1.54, -0.85, -0.02],
        'W': [-4.36, 3.94, 0.59, 3.44, -1.59],
        'Y': [-2.54, 2.44, 0.43, 0.04, -1.47],
        '-': [0.00, 0.00, 0.00, 0.00, 0.00],
    }

    # Five header columns per sequence position.
    positions = range(1, len(fastas[0][1]) + 1)
    header = ['#']
    for p in positions:
        prefix = 'Pos' + str(p) + '.ZSCALE'
        header.extend(prefix + z for z in ('1', '2', '3', '4', '5'))
    encodings = [header]

    for record in fastas:
        name, sequence = record[0], record[1]
        row = [name]
        for aa in sequence:
            row.extend(zscale[aa])
        encodings.append(row)
    return encodings
def EGAAC(fastas, window=5, **kw):
    """Compute sliding-window composition of five residue groups.

    For every window of `window` consecutive characters, the number of
    characters belonging to each group in the table below is divided by
    `window` and appended to the row, in the table's key order.

    Args:
        fastas: sequence of records; record[0] is the name and record[1]
            the sequence string.
        window: sliding-window width; must be at least 1.
        **kw: accepted for signature parity with the other encoders; unused.

    Returns:
        A list of rows. The first row is the header ('#',
        'SW.<window index>.<group>', ...); each following row is
        [name, fraction, ...]. Returns 0 after printing an error when
        checkFasta.checkFasta(fastas) is False or when window < 1.
    """
    if checkFasta.checkFasta(fastas) == False:
        print(
            'Error: for "EGAAC" encoding, the input fasta sequences should be with equal length. \n\n'
        )
        return 0

    if window < 1:
        print('Error: the sliding window should be greater than zero' + '\n\n')
        return 0

    group = {
        'alphaticr': 'GAVLMI',
        'aromatic': 'FYW',
        'postivecharger': 'KRH',
        'negativecharger': 'DE',
        'uncharger': 'STCPNQ'
    }
    groupKey = group.keys()

    header = ['#']
    for w in range(1, len(fastas[0][1]) - window + 2):
        label = 'SW.' + str(w) + '.'
        header.extend(label + g for g in groupKey)
    encodings = [header]

    for record in fastas:
        name, sequence = record[0], record[1]
        row = [name]
        # Start offsets of every window that fits entirely in the sequence.
        for start in range(len(sequence) - window + 1):
            count = Counter(sequence[start:start + window])
            for key in groupKey:
                members = sum(count[aa] for aa in group[key])
                row.append(members / window)
        encodings.append(row)
    return encodings
def BLOSUM62(fastas, **kw):
    """Encode each residue with its 20-value row from the table below.

    Every character of a sequence is replaced by the 20 integers stored for
    it in the lookup table; the gap character '-' maps to 20 zeros.

    Args:
        fastas: sequence of records; record[0] is the name and record[1]
            the sequence string.
        **kw: accepted for signature parity with the other encoders; unused.

    Returns:
        A list of rows. The first row is the header ('#', 'blosum62.F1',
        ...); each following row is [name, score, ...]. Returns 0 after
        printing an error when checkFasta.checkFasta(fastas) is False.

    Raises:
        KeyError: if a sequence contains a character missing from the table.
    """
    if checkFasta.checkFasta(fastas) == False:
        print('Error: for "BLOSUM62" encoding, the input fasta sequences should be with equal length. \n\n')
        return 0

    blosum62 = {
        'A': [4, -1, -2, -2, 0, -1, -1, 0, -2, -1, -1, -1, -1, -2, -1, 1, 0, -3, -2, 0],
        'R': [-1, 5, 0, -2, -3, 1, 0, -2, 0, -3, -2, 2, -1, -3, -2, -1, -1, -3, -2, -3],
        'N': [-2, 0, 6, 1, -3, 0, 0, 0, 1, -3, -3, 0, -2, -3, -2, 1, 0, -4, -2, -3],
        'D': [-2, -2, 1, 6, -3, 0, 2, -1, -1, -3, -4, -1, -3, -3, -1, 0, -1, -4, -3, -3],
        'C': [0, -3, -3, -3, 9, -3, -4, -3, -3, -1, -1, -3, -1, -2, -3, -1, -1, -2, -2, -1],
        'Q': [-1, 1, 0, 0, -3, 5, 2, -2, 0, -3, -2, 1, 0, -3, -1, 0, -1, -2, -1, -2],
        'E': [-1, 0, 0, 2, -4, 2, 5, -2, 0, -3, -3, 1, -2, -3, -1, 0, -1, -3, -2, -2],
        'G': [0, -2, 0, -1, -3, -2, -2, 6, -2, -4, -4, -2, -3, -3, -2, 0, -2, -2, -3, -3],
        'H': [-2, 0, 1, -1, -3, 0, 0, -2, 8, -3, -3, -1, -2, -1, -2, -1, -2, -2, 2, -3],
        'I': [-1, -3, -3, -3, -1, -3, -3, -4, -3, 4, 2, -3, 1, 0, -3, -2, -1, -3, -1, 3],
        'L': [-1, -2, -3, -4, -1, -2, -3, -4, -3, 2, 4, -2, 2, 0, -3, -2, -1, -2, -1, 1],
        'K': [-1, 2, 0, -1, -3, 1, 1, -2, -1, -3, -2, 5, -1, -3, -1, 0, -1, -3, -2, -2],
        'M': [-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1, 5, 0, -2, -1, -1, -1, -1, 1],
        'F': [-2, -3, -3, -3, -2, -3, -3, -3, -1, 0, 0, -3, 0, 6, -4, -2, -2, 1, 3, -1],
        'P': [-1, -2, -2, -1, -3, -1, -1, -2, -2, -3, -3, -1, -2, -4, 7, -1, -1, -4, -3, -2],
        'S': [1, -1, 1, 0, -1, 0, 0, 0, -1, -2, -2, 0, -1, -2, -1, 4, 1, -3, -2, -2],
        'T': [0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1, 1, 5, -2, -2, 0],
        'W': [-3, -3, -4, -4, -2, -2, -3, -2, -2, -3, -2, -3, -1, 1, -4, -3, -2, 11, 2, -3],
        'Y': [-2, -2, -2, -3, -2, -1, -2, -3, 2, -1, -1, -2, -1, 3, -3, -2, -2, 2, 7, -1],
        'V': [0, -3, -3, -3, -1, -2, -2, -3, -3, 3, 1, -2, 1, -1, -2, -2, 0, -3, -1, 4],
        '-': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    }

    # Twenty header columns per sequence position, numbered consecutively.
    column_count = len(fastas[0][1]) * 20
    header = ['#']
    header.extend('blosum62.F' + str(n) for n in range(1, column_count + 1))
    encodings = [header]

    for record in fastas:
        name, sequence = record[0], record[1]
        row = [name]
        for aa in sequence:
            row.extend(blosum62[aa])
        encodings.append(row)
    return encodings
def EAAC(fastas, window=5, **kw):
    """Compute sliding-window amino-acid composition.

    For every window of `window` consecutive characters, gap characters
    ('-') are removed and the frequency of each letter of the alphabet
    among the remaining characters is appended to the row. A letter that
    does not occur in the window (including the case of an all-gap window)
    contributes 0.

    Args:
        fastas: sequence of records; record[0] is the name and record[1]
            the sequence string.
        window: sliding-window width; must be at least 1 and no larger than
            the shortest sequence.
        **kw: may provide 'order', the alphabet (and column order) to use.
            When it is absent or None, 'ACDEFGHIKLMNPQRSTVWY' is used.

    Returns:
        A list of rows. The first row is the header ('#',
        'SW.<window index>.<letter>', ...); each following row is
        [name, frequency, ...]. Returns 0 after printing an error when the
        sequences fail checkFasta.checkFasta, when window < 1, or when
        checkFasta.minSequenceLength(fastas) is smaller than the window.
    """
    if checkFasta.checkFasta(fastas) == False:
        print(
            'Error: for "EAAC" encoding, the input fasta sequences should be with equal length. \n\n'
        )
        return 0

    if window < 1:
        print('Error: the sliding window should be greater than zero' + '\n\n')
        return 0

    if checkFasta.minSequenceLength(fastas) < window:
        print(
            'Error: all the sequence length should be larger than the sliding window :'
            + str(window) + '\n\n')
        return 0

    # .get() so that callers who omit 'order' get the default alphabet
    # instead of a KeyError.
    order = kw.get('order')
    AA = order if order is not None else 'ACDEFGHIKLMNPQRSTVWY'

    encodings = []
    header = ['#']
    for w in range(1, len(fastas[0][1]) - window + 2):
        for aa in AA:
            header.append('SW.' + str(w) + '.' + aa)
    encodings.append(header)

    for record in fastas:
        name, sequence = record[0], record[1]
        code = [name]
        # Start offsets of every window that fits entirely in the sequence.
        for start in range(len(sequence) - window + 1):
            residues = sequence[start:start + window].replace('-', '')
            total = len(residues)
            # Counter is empty for an all-gap window, so no division by zero.
            frequency = {
                key: occurrences / total
                for key, occurrences in Counter(residues).items()
            }
            code.extend(frequency.get(aa, 0) for aa in AA)
        encodings.append(code)
    return encodings