def CKSAAGP(fastas, gap = 5, **kw):
    """Encode sequences as gapped residue-group pair frequencies.

    Each residue is mapped to one of five groups; for every gap ``g`` in
    ``0..gap`` the function counts ordered group pairs whose residues are
    ``g + 1`` positions apart and divides each count by the number of
    counted pairs for that gap.

    Args:
        fastas: iterable of records where item ``[0]`` is the sequence name
            and item ``[1]`` is the sequence string ('-' characters are
            removed before encoding).
        gap: largest gap to evaluate; must be >= 0.
        **kw: accepted but not used by this function.

    Returns:
        A list of rows. The first row is the header (``'#'`` followed by
        ``<group1>.<group2>.gap<g>`` labels); each following row holds the
        record name and its values. Returns ``0`` after printing an error
        when ``gap`` is negative or the shortest sequence (as reported by
        ``checkFasta.minSequenceLength``) is shorter than ``gap + 2``.
    """
    if gap < 0:
        print('Error: the gap should be equal or greater than zero' + '\n\n')
        return 0

    if checkFasta.minSequenceLength(fastas) < gap+2:
        print('Error: all the sequence length should be greater than the (gap value) + 2 = ' + str(gap+2) + '\n\n')
        return 0

    group = {
        'alphaticr': 'GAVLMI',
        'aromatic': 'FYW',
        'postivecharger': 'KRH',
        'negativecharger': 'DE',
        'uncharger': 'STCPNQ'
    }
    AA = 'ARNDCQEGHILKMFPSTWYV'
    groupKey = group.keys()

    # Residue letter -> name of the group it belongs to.
    residue_group = {aa: key for key in groupKey for aa in group[key]}

    # Every ordered pair of group names, in the order used for the columns.
    gPairIndex = [key1 + '.' + key2 for key1 in groupKey for key2 in groupKey]

    header = ['#'] + [
        p + '.gap' + str(g) for g in range(gap + 1) for p in gPairIndex
    ]
    encodings = [header]

    for entry in fastas:
        name = entry[0]
        sequence = re.sub('-', '', entry[1])
        seq_len = len(sequence)
        code = [name]

        for g in range(gap + 1):
            # Fresh counter mapping supplied by the helper defined elsewhere.
            gPair = generateGroupPairs(groupKey)
            pair_total = 0
            offset = g + 1

            # Only positions whose partner still lies inside the sequence.
            for p1 in range(seq_len - offset):
                first, second = sequence[p1], sequence[p1 + offset]
                if first not in AA or second not in AA:
                    continue
                pair_key = residue_group[first] + '.' + residue_group[second]
                gPair[pair_key] += 1
                pair_total += 1

            if pair_total == 0:
                # No valid pair at this gap: emit zeros, avoid dividing by 0.
                code.extend(0 for _ in gPairIndex)
            else:
                code.extend(gPair[gp] / pair_total for gp in gPairIndex)

        encodings.append(code)

    return encodings
def PAAC(fastas, lambdaValue=1, w=0.05, **kw):
    """Compute a pseudo amino acid composition style encoding.

    Physicochemical property values are read from ``data/PAAC.txt``
    (located relative to this file's directory, with a trailing ``codes``
    component stripped), standardised per property, and combined with
    residue counts and sequence-order correlation factors.

    Args:
        fastas: iterable of records where item ``[0]`` is the sequence name
            and item ``[1]`` is the sequence string ('-' characters are
            removed before encoding).
        lambdaValue: number of correlation tiers (lags 1..lambdaValue).
        w: weight applied to the correlation factors.
        **kw: accepted but not used by this function.

    Returns:
        A list of rows. The first row is the header (``'#'``, one
        ``Xc1.<aa>`` column per amino acid in the data file header, then
        ``Xc2.lambda<n>`` columns); each following row holds the record
        name and its values. Returns ``0`` after printing an error when the
        shortest sequence (as reported by ``checkFasta.minSequenceLength``)
        is shorter than ``lambdaValue + 1``.
    """
    if checkFasta.minSequenceLength(fastas) < lambdaValue + 1:
        print(
            'Error: all the sequence length should be larger than the lambdaValue+1: '
            + str(lambdaValue + 1) + '\n\n')
        return 0

    base_dir = re.sub('codes$', '',
                      os.path.split(os.path.realpath(__file__))[0])
    if platform.system() == 'Windows':
        dataFile = base_dir + r'\data\PAAC.txt'
    else:
        dataFile = base_dir + '/data/PAAC.txt'

    with open(dataFile) as f:
        records = f.readlines()

    # First line: a label followed by the amino acid letters.
    AA = ''.join(records[0].rstrip().split()[1:])
    AADict = {aa: pos for pos, aa in enumerate(AA)}

    AAProperty = []
    for line in records[1:]:
        fields = line.split()
        if not fields:
            # Blank lines (e.g. a trailing newline at end of file) carry no
            # property and previously crashed the parser.
            continue
        # fields[0] is the property name; the rest are per-residue values.
        AAProperty.append([float(value) for value in fields[1:]])

    # Standardise every property to zero mean / unit (population) deviation
    # over the 20 amino acids.
    AAProperty1 = []
    for values in AAProperty:
        meanI = sum(values) / 20
        fenmu = math.sqrt(sum((v - meanI) ** 2 for v in values) / 20)
        AAProperty1.append([(v - meanI) / fenmu for v in values])

    header = ['#']
    header.extend('Xc1.' + aa for aa in AA)
    header.extend('Xc2.lambda' + str(n) for n in range(1, lambdaValue + 1))
    encodings = [header]

    for entry in fastas:
        name, sequence = entry[0], re.sub('-', '', entry[1])
        seq_len = len(sequence)

        # Mean pairwise correlation (via the Rvalue helper) for each lag n.
        theta = [
            sum(
                Rvalue(sequence[j], sequence[j + n], AADict, AAProperty1)
                for j in range(seq_len - n)) / (seq_len - n)
            for n in range(1, lambdaValue + 1)
        ]
        denominator = 1 + w * sum(theta)

        # NOTE(review): raw counts are scaled by a fixed 1/100 rather than
        # by the sequence length - confirm this scaling is intended.
        composition = [sequence.count(aa) / 100 for aa in AA]

        code = [name]
        code.extend(value / denominator for value in composition)
        code.extend((w * t) / denominator for t in theta)
        encodings.append(code)

    return encodings
def CKSAAP(fastas, gap=5, **kw):
    """Encode sequences as k-spaced amino acid pair frequencies.

    For every gap ``g`` in ``0..gap`` the function counts ordered residue
    pairs that are ``g + 1`` positions apart and divides each count by the
    number of counted pairs for that gap. Pairs containing a character
    outside the 20 standard amino acids are ignored.

    Args:
        fastas: iterable of records where item ``[0]`` is the sequence name
            and item ``[1]`` is the sequence string.
        gap: largest gap to evaluate; must be >= 0.
        **kw: accepted but not used by this function.

    Returns:
        A list of rows. The first row is the header (``'#'`` followed by
        ``<aa1><aa2>.gap<g>`` labels); each following row holds the record
        name and its values. Returns ``0`` after printing an error when
        ``gap`` is negative or the shortest sequence (as reported by
        ``checkFasta.minSequenceLength``) is shorter than ``gap + 2``.
    """
    if gap < 0:
        print('Error: the gap should be equal or greater than zero' + '\n\n')
        return 0

    if checkFasta.minSequenceLength(fastas) < gap + 2:
        print(
            'Error: all the sequence length should be larger than the (gap value) + 2 = '
            + str(gap + 2) + '\n\n')
        return 0

    AA = 'ACDEFGHIKLMNPQRSTVWY'
    aaPairs = [aa1 + aa2 for aa1 in AA for aa2 in AA]

    header = ['#']
    for g in range(gap + 1):
        header.extend(pair + '.gap' + str(g) for pair in aaPairs)
    encodings = [header]

    for entry in fastas:
        name, sequence = entry[0], entry[1]
        code = [name]

        for g in range(gap + 1):
            counts = dict.fromkeys(aaPairs, 0)
            total = 0
            # The partner residue sits g + 1 positions downstream. The
            # offset must use the current gap g (not the maximum `gap`),
            # otherwise every gap block repeats the same counts.
            offset = g + 1
            for index1 in range(len(sequence) - offset):
                pair = sequence[index1] + sequence[index1 + offset]
                # Membership in `counts` means both residues are in AA.
                if pair in counts:
                    counts[pair] += 1
                    total += 1

            if total == 0:
                # No valid pair (e.g. only non-standard characters): emit
                # zeros instead of dividing by zero.
                code.extend(0 for _ in aaPairs)
            else:
                code.extend(counts[pair] / total for pair in aaPairs)

        encodings.append(code)

    return encodings
def EAAC(fastas, window=5, **kw):
    """Encode sequences as sliding-window amino acid frequencies.

    For every window of ``window`` consecutive positions, the frequency of
    each amino acid is computed over the non-gap ('-') characters inside
    that window.

    Args:
        fastas: iterable of records where item ``[0]`` is the sequence name
            and item ``[1]`` is the sequence string. The header is sized
            from the first record's sequence.
        window: sliding window width; must be >= 1.
        **kw: optional ``order`` entry giving the amino acid column order;
            when absent or ``None`` the default ``'ACDEFGHIKLMNPQRSTVWY'``
            is used.

    Returns:
        A list of rows. The first row is the header (``'#'`` followed by
        ``SW.<window index>.<aa>`` labels); each following row holds the
        record name and its values. Returns ``0`` after printing an error
        when ``checkFasta.checkFasta`` reports ``False``, when ``window``
        is below 1, or when the shortest sequence (as reported by
        ``checkFasta.minSequenceLength``) is shorter than ``window``.
    """
    # Compared explicitly with False: the helper's return contract is not
    # visible here, so a bare truthiness test could change behaviour.
    if checkFasta.checkFasta(fastas) == False:  # noqa: E712
        print(
            'Error: for "EAAC" encoding, the input fasta sequences should be with equal length. \n\n'
        )
        return 0

    if window < 1:
        print('Error: the sliding window should be greater than zero' + '\n\n')
        return 0

    if checkFasta.minSequenceLength(fastas) < window:
        print(
            'Error: all the sequence length should be larger than the sliding window :'
            + str(window) + '\n\n')
        return 0

    # `order` is optional; indexing kw directly raised KeyError when the
    # caller did not supply it.
    order = kw.get('order')
    AA = order if order is not None else 'ACDEFGHIKLMNPQRSTVWY'

    header = ['#']
    for w in range(1, len(fastas[0][1]) - window + 2):
        header.extend('SW.' + str(w) + '.' + aa for aa in AA)
    encodings = [header]

    for entry in fastas:
        name, sequence = entry[0], entry[1]
        code = [name]

        for start in range(len(sequence) - window + 1):
            # Gap characters do not count towards the window's residues.
            residues = re.sub('-', '', sequence[start:start + window])
            total = len(residues)
            # Empty when the window holds only gaps, so no division occurs.
            frequency = {
                aa: count / total
                for aa, count in Counter(residues).items()
            }
            code.extend(frequency.get(aa, 0) for aa in AA)

        encodings.append(code)

    return encodings
def NMBroto(fastas,
            props=[
                'CIDH920105', 'BHAR880101', 'CHAM820101', 'CHAM820102',
                'CHOC760101', 'BIGC670101', 'CHAM810101', 'DAYM780201'
            ],
            nlag=6,
            **kw):
    """Compute normalised Moreau-Broto style autocorrelation descriptors.

    Property values are read from ``data/AAidx.txt``, standardised per
    property over the 20 amino acids, and for every property and lag ``n``
    in ``1..nlag`` the mean product of the property values of residues
    ``n`` positions apart is reported.

    Args:
        fastas: iterable of records where item ``[0]`` is the sequence name
            and item ``[1]`` is the sequence string ('-' characters are
            removed before encoding).
        props: property identifiers to look up in the data file. The
            default list is never mutated by this function.
        nlag: largest lag to evaluate.
        **kw: accepted but not used by this function.

    Returns:
        A list of rows. The first row is the header (``'#'`` followed by
        ``<property>.lag<n>`` labels); each following row holds the record
        name and its values, or ``'NA'`` entries when the gap-stripped
        sequence is not longer than ``nlag``. Returns ``0`` after printing
        an error when the shortest sequence (as reported by
        ``checkFasta.minSequenceLength``) is shorter than ``nlag + 1``, and
        ``None`` after printing a message when a requested property is not
        present in the data file.
    """
    if checkFasta.minSequenceLength(fastas) < nlag + 1:
        print(
            'Error: all the sequence length should be larger than the nlag+1: '
            + str(nlag + 1) + '\n\n')
        return 0

    AA = 'ARNDCQEGHILKMFPSTWYV'

    # The data file is resolved differently per platform: relative to this
    # file on Windows, relative to sys.path[0] elsewhere.
    if platform.system() == 'Windows':
        fileAAidx = re.sub(
            'codes$', '',
            os.path.split(os.path.realpath(__file__))[0]) + r'\data\AAidx.txt'
    else:
        fileAAidx = sys.path[0] + '/data/AAidx.txt'

    with open(fileAAidx) as f:
        records = f.readlines()[1:]  # skip the header line

    # Property identifier -> list of per-residue value strings.
    myDict = {}
    for line in records:
        array = line.rstrip().split('\t')
        myDict[array[0]] = array[1:]

    AAidx = []
    for prop in props:
        if prop not in myDict:
            print('"' + prop + '" properties not exist.')
            return None
        AAidx.append(myDict[prop])

    AAidx = np.array([float(j) for row in AAidx for j in row]).reshape(
        (len(AAidx), 20))

    # Standardise each property row to zero mean and unit deviation.
    pstd = np.std(AAidx, axis=1)
    pmean = np.average(AAidx, axis=1)
    AAidx = (AAidx - pmean[:, np.newaxis]) / pstd[:, np.newaxis]

    # One residue -> standardised value table per property.
    prop_lookup = [dict(zip(AA, row)) for row in AAidx]

    header = ['#']
    for p in props:
        header.extend(p + '.lag' + str(n) for n in range(1, nlag + 1))
    encodings = [header]

    for entry in fastas:
        name, sequence = entry[0], re.sub('-', '', entry[1])
        code = [name]
        N = len(sequence)

        for lookup in prop_lookup:
            # Residues outside AA contribute a value of 0. The previous
            # `index.get(residue, 0)` fell back to *index* 0, i.e. it
            # silently used alanine's property value instead.
            values = [lookup.get(aa, 0.0) for aa in sequence]
            for n in range(1, nlag + 1):
                if N > nlag:
                    rn = sum(values[j] * values[j + n]
                             for j in range(N - n)) / (N - n)
                else:
                    rn = 'NA'
                code.append(rn)

        encodings.append(code)

    return encodings