def PAAC(fastas, lambdaValue=2, w=0.05, **kw):
    """Encode sequences as pseudo-amino-acid-composition style descriptors.

    Args:
        fastas: iterable of records where item ``[0]`` is the sequence name
            and item ``[1]`` is the sequence string ('-' characters are
            stripped before encoding).
        lambdaValue: number of lag terms; lags ``1..lambdaValue`` are used.
        w: weight multiplied into the lag (theta) terms.
        **kw: accepted for interface compatibility and ignored.

    Returns:
        A list of rows. The first row is the header (``'#'``, one
        ``'Xc1.<aa>'`` column per amino acid in the property file, then one
        ``'Xc2.lambda<n>'`` column per lag). Every following row is
        ``[name, value, ...]`` for one sequence.
        Returns ``0`` (after printing an error) when
        ``checkFasta.minSequenceLengthWithNormalAA(fastas)`` is smaller than
        ``lambdaValue + 1``.
    """
    if checkFasta.minSequenceLengthWithNormalAA(fastas) < lambdaValue + 1:
        print(
            'Error: all the sequence length should be larger than the lambdaValue+1: '
            + str(lambdaValue + 1) + '\n\n')
        return 0

    # The data directory sits beside the 'codes' directory, so a trailing
    # 'codes' is stripped from this file's directory. os.path.join picks the
    # right separator, so no per-platform branch is needed.
    baseDir = re.sub('codes$', '',
                     os.path.split(os.path.realpath(__file__))[0])
    dataFile = os.path.join(baseDir, 'data', 'PAAC.txt')
    with open(dataFile) as f:
        records = f.readlines()

    # First line: a label followed by the amino-acid letters.
    AA = ''.join(records[0].rstrip().split()[1:])
    AADict = {aa: position for position, aa in enumerate(AA)}

    # Remaining lines: a property name followed by one value per amino acid.
    # Blank lines (e.g. a trailing newline at end of file) are skipped; they
    # previously caused a TypeError.
    AAProperty = []
    for line in records[1:]:
        fields = line.split()
        if not fields:
            continue
        AAProperty.append([float(value) for value in fields[1:]])

    # Standardise every property row to zero mean and unit (population)
    # standard deviation, using the row's real length rather than a fixed 20.
    AAProperty1 = []
    for values in AAProperty:
        count = len(values)
        meanI = sum(values) / count
        fenmu = math.sqrt(sum((v - meanI) ** 2 for v in values) / count)
        AAProperty1.append([(v - meanI) / fenmu for v in values])

    header = ['#']
    header.extend('Xc1.' + aa for aa in AA)
    header.extend('Xc2.lambda' + str(n) for n in range(1, lambdaValue + 1))
    encodings = [header]

    for record in fastas:
        name, sequence = record[0], re.sub('-', '', record[1])
        seqLen = len(sequence)

        # theta[n-1] is the mean of Rvalue over all residue pairs n apart.
        # NOTE(review): Rvalue is defined elsewhere in this module; its exact
        # formula is not visible here.
        theta = [
            sum(Rvalue(sequence[j], sequence[j + n], AADict, AAProperty1)
                for j in range(seqLen - n)) / (seqLen - n)
            for n in range(1, lambdaValue + 1)
        ]

        # Both the composition and the lag terms share this denominator.
        denominator = 1 + w * sum(theta)
        code = [name]
        code.extend(sequence.count(aa) / denominator for aa in AA)
        code.extend((w * t) / denominator for t in theta)
        encodings.append(code)

    return encodings
def _soc_load_distance_matrix(path):
    """Read a 20x20 amino-acid distance table from a whitespace-separated file.

    The first line (column header) and the first field of every row (row
    label) are discarded. Blank lines are skipped.
    """
    with open(path) as f:
        rows = [line.split()[1:] for line in f.readlines()[1:] if line.strip()]
    return np.array([float(value) for row in rows
                     for value in row]).reshape((20, 20))


def SOCNumber(fastas, nlag=30, **kw):
    """Encode sequences as sequence-order-coupling numbers.

    For each lag ``n`` in ``1..nlag`` the descriptor is the mean squared
    distance between residues ``n`` positions apart, computed once with the
    Schneider-Wrede table and once with the Grantham table.

    Args:
        fastas: iterable of records where item ``[0]`` is the sequence name
            and item ``[1]`` is the sequence string ('-' characters are
            stripped before encoding).
        nlag: largest lag to compute.
        **kw: accepted for interface compatibility and ignored.

    Returns:
        A list of rows. The first row is the header; every following row is
        ``[name, <nlag Schneider values>, <nlag Grantham values>]``.
        Returns ``0`` (after printing an error) when
        ``checkFasta.minSequenceLengthWithNormalAA(fastas)`` is smaller than
        ``nlag + 1``.

    Raises:
        KeyError: if a sequence contains a character that is not one of the
            20 letters in the lookup alphabets.
    """
    if checkFasta.minSequenceLengthWithNormalAA(fastas) < nlag + 1:
        print('Error: all the sequence length should be larger than the nlag+1: '
              + str(nlag + 1) + '\n\n')
        return 0

    # The data directory sits beside the 'codes' directory; os.path.join
    # handles the platform separator.
    baseDir = re.sub('codes$', '',
                     os.path.split(os.path.realpath(__file__))[0])
    dataFile = os.path.join(baseDir, 'data', 'Schneider-Wrede.txt')
    dataFile1 = os.path.join(baseDir, 'data', 'Grantham.txt')

    # Each table has its own row/column ordering of the 20 amino acids.
    AA = 'ACDEFGHIKLMNPQRSTVWY'
    AA1 = 'ARNDCQEGHILKMFPSTWYV'
    DictAA = {aa: position for position, aa in enumerate(AA)}
    DictAA1 = {aa: position for position, aa in enumerate(AA1)}

    AADistance = _soc_load_distance_matrix(dataFile)
    AADistance1 = _soc_load_distance_matrix(dataFile1)

    header = ['#']
    header.extend('Schneider.lag' + str(n) for n in range(1, nlag + 1))
    # NOTE(review): 'gGrantham' looks like a typo for 'Grantham', but the
    # column name is emitted output, so it is kept unchanged for consumers.
    header.extend('gGrantham.lag' + str(n) for n in range(1, nlag + 1))
    encodings = [header]

    for record in fastas:
        name, sequence = record[0], re.sub('-', '', record[1])
        seqLen = len(sequence)
        code = [name]
        # Schneider-Wrede block first, then Grantham, matching the header.
        for matrix, lookup in ((AADistance, DictAA), (AADistance1, DictAA1)):
            for n in range(1, nlag + 1):
                total = sum(
                    matrix[lookup[sequence[j]]][lookup[sequence[j + n]]] ** 2
                    for j in range(seqLen - n))
                code.append(total / (seqLen - n))
        encodings.append(code)

    return encodings
def NMBroto(fastas,
            props=[
                'CIDH920105', 'BHAR880101', 'CHAM820101', 'CHAM820102',
                'CHOC760101', 'BIGC670101', 'CHAM810101', 'DAYM780201'
            ],
            nlag=5,
            **kw):
    """Encode sequences as normalised autocorrelation descriptors.

    Each requested property is standardised across the 20 amino acids
    (zero mean, unit population standard deviation). For every property and
    every lag ``n`` in ``1..nlag`` the descriptor is the mean product of the
    standardised values of residues ``n`` positions apart.

    Args:
        fastas: iterable of records where item ``[0]`` is the sequence name
            and item ``[1]`` is the sequence string ('-' characters are
            stripped before encoding).
        props: property identifiers to look up in ``data/AAidx.txt``. The
            default list is only read, never modified.
        nlag: largest lag to compute.
        **kw: accepted for interface compatibility and ignored.

    Returns:
        A list of rows. The first row is the header (``'#'`` then
        ``'<prop>.lag<n>'`` columns); every following row is
        ``[name, value, ...]``. A value is the string ``'NA'`` when the
        sequence is not longer than ``nlag``.
        Returns ``0`` (after printing an error) when
        ``checkFasta.minSequenceLengthWithNormalAA(fastas)`` is smaller than
        ``nlag + 1``, and ``None`` (after printing a message) when a
        requested property is not present in the index file.
    """
    if checkFasta.minSequenceLengthWithNormalAA(fastas) < nlag + 1:
        print(
            'Error: all the sequence length should be larger than the nlag+1: '
            + str(nlag + 1) + '\n\n')
        return 0

    AA = 'ARNDCQEGHILKMFPSTWYV'

    # Resolve the data file relative to this source file on every platform.
    # The non-Windows branch used sys.path[0], which depends on how the
    # interpreter was started and breaks when the module is imported from
    # another location.
    baseDir = re.sub('codes$', '',
                     os.path.split(os.path.realpath(__file__))[0])
    fileAAidx = os.path.join(baseDir, 'data', 'AAidx.txt')
    with open(fileAAidx) as f:
        records = f.readlines()[1:]  # drop the header line

    # Tab-separated rows: property id, then one value per amino acid.
    myDict = {}
    for line in records:
        fields = line.rstrip().split('\t')
        myDict[fields[0]] = fields[1:]

    selected = []
    for p in props:
        if p not in myDict:
            print('"' + p + '" properties not exist.')
            return None
        selected.append(myDict[p])

    AAidx = np.array([float(value) for row in selected
                      for value in row]).reshape((len(selected), 20))
    # Standardise each property row; np.newaxis broadcasts the per-row
    # statistics across the 20 columns.
    pmean = np.average(AAidx, axis=1)
    pstd = np.std(AAidx, axis=1)
    AAidx = (AAidx - pmean[:, np.newaxis]) / pstd[:, np.newaxis]

    index = {aa: position for position, aa in enumerate(AA)}

    header = ['#']
    for p in props:
        header.extend(p + '.lag' + str(n) for n in range(1, nlag + 1))
    encodings = [header]

    for record in fastas:
        name, sequence = record[0], re.sub('-', '', record[1])
        code = [name]
        N = len(sequence)
        # Any character not in AA falls back to column 0, i.e. it is scored
        # with the standardised value of 'A' (not with a value of zero).
        columns = [index.get(residue, 0) for residue in sequence]
        for prop in range(len(props)):
            # Look each residue's property value up once, not once per lag.
            values = [AAidx[prop][col] for col in columns]
            for n in range(1, nlag + 1):
                if N > nlag:
                    rn = sum(values[j] * values[j + n]
                             for j in range(N - n)) / (N - n)
                else:
                    rn = 'NA'
                code.append(rn)
        encodings.append(code)

    return encodings