Beispiel #1
0
def PAAC(fastas, lambdaValue=2, w=0.05, **kw):
    """Encode sequences as residue counts plus lagged sequence-order terms.

    The property table is read from ``data/PAAC.txt`` located relative to
    this module's directory (with a trailing ``codes`` path component
    stripped).  Its first line lists the residue letters after a leading
    label; every following non-blank line is a property name followed by
    one numeric value per residue.

    Args:
        fastas: Iterable of records where ``record[0]`` is the sequence name
            and ``record[1]`` the sequence string.  ``'-'`` characters are
            removed before encoding.
        lambdaValue: Largest gap ``n`` for which a sequence-order term is
            computed (terms are produced for ``n = 1 .. lambdaValue``).
        w: Weight applied to the sequence-order terms.
        **kw: Accepted for interface compatibility and ignored.

    Returns:
        A list of rows.  The first row is the header (``'#'``, one
        ``Xc1.<residue>`` column per residue, one ``Xc2.lambda<n>`` column
        per gap); each following row holds the sequence name and its values.
        Returns ``0`` (after printing a message) when
        ``checkFasta.minSequenceLengthWithNormalAA`` reports a length below
        ``lambdaValue + 1``.
    """
    if checkFasta.minSequenceLengthWithNormalAA(fastas) < lambdaValue + 1:
        print(
            'Error: all the sequence length should be larger than the lambdaValue+1: '
            + str(lambdaValue + 1) + '\n\n')
        return 0

    # Resolve the data file relative to this module; os.path.join picks the
    # right separator, so no per-platform branch is required.
    base_dir = re.sub('codes$', '',
                      os.path.split(os.path.realpath(__file__))[0])
    dataFile = os.path.join(base_dir, 'data', 'PAAC.txt')
    with open(dataFile) as f:
        records = f.readlines()

    AA = ''.join(records[0].rstrip().split()[1:])
    AADict = {aa: pos for pos, aa in enumerate(AA)}

    # One list of raw values per property; blank lines are skipped instead
    # of crashing on a missing row.
    AAProperty = []
    for record in records[1:]:
        fields = record.split()
        if not fields:
            continue
        AAProperty.append([float(value) for value in fields[1:]])

    # Standardise each property to zero mean and unit (population) standard
    # deviation across the residues in its row.
    AAProperty1 = []
    for values in AAProperty:
        count = len(values)
        mean = sum(values) / count
        std = math.sqrt(sum((v - mean) ** 2 for v in values) / count)
        AAProperty1.append([(v - mean) / std for v in values])

    header = ['#']
    header.extend('Xc1.' + aa for aa in AA)
    header.extend('Xc2.lambda' + str(n) for n in range(1, lambdaValue + 1))
    encodings = [header]

    for record in fastas:
        name, sequence = record[0], re.sub('-', '', record[1])
        length = len(sequence)

        # theta[n-1]: mean of Rvalue over all residue pairs that are n apart.
        theta = []
        for n in range(1, lambdaValue + 1):
            pair_total = sum(
                Rvalue(sequence[j], sequence[j + n], AADict, AAProperty1)
                for j in range(length - n))
            theta.append(pair_total / (length - n))

        # Shared denominator for both the count and the order columns.
        denominator = 1 + w * sum(theta)
        code = [name]
        code.extend(sequence.count(aa) / denominator for aa in AA)
        code.extend((w * t) / denominator for t in theta)
        encodings.append(code)
    return encodings
Beispiel #2
0
def _read_distance_matrix(path):
    """Load a 20x20 numeric table, dropping the header row and row labels."""
    with open(path) as f:
        lines = f.readlines()[1:]
    # Blank lines are skipped rather than stored as None placeholders.
    rows = [line.split()[1:] for line in lines if line.strip()]
    values = [float(cell) for row in rows for cell in row]
    return np.array(values).reshape((20, 20))


def SOCNumber(fastas, nlag=30, **kw):
    """Encode sequences as lagged mean squared residue-pair distances.

    Two distance tables are read from the ``data`` directory located
    relative to this module (with a trailing ``codes`` path component
    stripped): ``Schneider-Wrede.txt`` and ``Grantham.txt``.  Rows and
    columns of the first table are indexed with the residue order
    ``ACDEFGHIKLMNPQRSTVWY`` and those of the second with
    ``ARNDCQEGHILKMFPSTWYV`` (assumed to match the files' layout).

    Args:
        fastas: Iterable of records where ``record[0]`` is the sequence name
            and ``record[1]`` the sequence string.  ``'-'`` characters are
            removed before encoding.
        nlag: Largest gap; values are produced for ``n = 1 .. nlag``.
        **kw: Accepted for interface compatibility and ignored.

    Returns:
        A list of rows.  The first row is the header; each following row
        holds the sequence name, then ``nlag`` values from the first table
        and ``nlag`` values from the second.  Returns ``0`` (after printing
        a message) when ``checkFasta.minSequenceLengthWithNormalAA`` reports
        a length below ``nlag + 1``.

    Raises:
        KeyError: If a sequence contains a character outside the 20-letter
            residue alphabets.
    """
    if checkFasta.minSequenceLengthWithNormalAA(fastas) < nlag + 1:
        print('Error: all the sequence length should be larger than the nlag+1: ' + str(nlag + 1) + '\n\n')
        return 0

    # Resolve both data files relative to this module; os.path.join picks
    # the right separator, so no per-platform branch is required.
    base_dir = re.sub('codes$', '', os.path.split(os.path.realpath(__file__))[0])
    dataFile = os.path.join(base_dir, 'data', 'Schneider-Wrede.txt')
    dataFile1 = os.path.join(base_dir, 'data', 'Grantham.txt')

    AA = 'ACDEFGHIKLMNPQRSTVWY'
    AA1 = 'ARNDCQEGHILKMFPSTWYV'
    DictAA = {aa: pos for pos, aa in enumerate(AA)}
    DictAA1 = {aa: pos for pos, aa in enumerate(AA1)}

    AADistance = _read_distance_matrix(dataFile)
    AADistance1 = _read_distance_matrix(dataFile1)

    header = ['#']
    header.extend('Schneider.lag' + str(n) for n in range(1, nlag + 1))
    # The 'gGrantham' spelling is kept as-is: it is emitted output that
    # downstream consumers may depend on.
    header.extend('gGrantham.lag' + str(n) for n in range(1, nlag + 1))
    encodings = [header]

    for record in fastas:
        name, sequence = record[0], re.sub('-', '', record[1])
        length = len(sequence)
        code = [name]
        # Same computation for both tables, each with its own residue order.
        for matrix, lookup in ((AADistance, DictAA), (AADistance1, DictAA1)):
            # Translate residues to matrix indices once per table.
            positions = [lookup[aa] for aa in sequence]
            for n in range(1, nlag + 1):
                squared = sum(
                    matrix[a][b] ** 2
                    for a, b in zip(positions, positions[n:]))
                code.append(squared / (length - n))
        encodings.append(code)
    return encodings
Beispiel #3
0
def NMBroto(fastas,
            props=(
                'CIDH920105', 'BHAR880101', 'CHAM820101', 'CHAM820102',
                'CHOC760101', 'BIGC670101', 'CHAM810101', 'DAYM780201'
            ),
            nlag=5,
            **kw):
    """Encode sequences as lagged autocorrelations of residue properties.

    Property values are read from the tab-separated ``data/AAidx.txt``
    located relative to this module's directory (with a trailing ``codes``
    path component stripped).  The first line of the file is skipped; each
    other line is a property name followed by 20 values, indexed with the
    residue order ``ARNDCQEGHILKMFPSTWYV``.  Every selected property is
    standardised to zero mean and unit standard deviation before use.

    Args:
        fastas: Iterable of records where ``record[0]`` is the sequence name
            and ``record[1]`` the sequence string.  ``'-'`` characters are
            removed before encoding.
        props: Names of the properties to use, in output order.
        nlag: Largest gap; values are produced for ``n = 1 .. nlag``.
        **kw: Accepted for interface compatibility and ignored.

    Returns:
        A list of rows.  The first row is the header (``'#'`` followed by
        ``<property>.lag<n>`` columns); each following row holds the
        sequence name and, per property and lag, the mean product of the
        standardised values at positions ``j`` and ``j + n``.  Sequences no
        longer than ``nlag`` get ``'NA'`` in every column.  Returns ``0``
        (after printing a message) when
        ``checkFasta.minSequenceLengthWithNormalAA`` reports a length below
        ``nlag + 1``, and ``None`` (after printing a message) when a
        requested property is not in the data file.
    """
    if checkFasta.minSequenceLengthWithNormalAA(fastas) < nlag + 1:
        print(
            'Error: all the sequence length should be larger than the nlag+1: '
            + str(nlag + 1) + '\n\n')
        return 0

    AA = 'ARNDCQEGHILKMFPSTWYV'

    # Always resolve the data file relative to this module.  The previous
    # non-Windows branch used sys.path[0], which points at the launching
    # script's directory and breaks when this module is imported from
    # elsewhere.
    base_dir = re.sub('codes$', '',
                      os.path.split(os.path.realpath(__file__))[0])
    fileAAidx = os.path.join(base_dir, 'data', 'AAidx.txt')
    with open(fileAAidx) as f:
        records = f.readlines()[1:]

    table = {}
    for line in records:
        fields = line.rstrip().split('\t')
        table[fields[0]] = fields[1:]

    # Stop at the first unknown property, before any numeric work.
    for prop in props:
        if prop not in table:
            print('"' + prop + '" properties not exist.')
            return None

    AAidx = np.array(
        [float(value) for prop in props for value in table[prop]]
    ).reshape((len(props), 20))

    # Standardise each property row using broadcasting.
    pstd = np.std(AAidx, axis=1)
    pmean = np.average(AAidx, axis=1)
    AAidx = (AAidx - pmean[:, None]) / pstd[:, None]

    index = {aa: pos for pos, aa in enumerate(AA)}

    header = ['#']
    for prop in props:
        header.extend(prop + '.lag' + str(n) for n in range(1, nlag + 1))
    encodings = [header]

    for record in fastas:
        name, sequence = record[0], re.sub('-', '', record[1])
        code = [name]
        N = len(sequence)
        if N > nlag:
            for row in AAidx:
                # NOTE: characters missing from `index` fall back to column
                # 0, i.e. they are scored with the values of 'A' (not with a
                # literal zero).
                values = [row[index.get(aa, 0)] for aa in sequence]
                for n in range(1, nlag + 1):
                    products = sum(
                        left * right
                        for left, right in zip(values, values[n:]))
                    code.append(products / (N - n))
        else:
            # Too short for the requested lags: fill every column with 'NA'.
            code.extend(['NA'] * (len(props) * nlag))
        encodings.append(code)
    return encodings