Ejemplo n.º 1
0
    encodings = []
    header = ['#']
    for g in range(gap + 1):
        for f in features:
            header.append(f + '.gap' + str(g))
    encodings.append(header)

    for i in fastas:
        name, sequence = i[0], re.sub('-', '', i[1])
        code = [name]
        if len(sequence) < 2 * gap + 3:
            print(
                'Error: for "KSCTriad" encoding, the input fasta sequences should be greater than (2*gap+3). \n\n'
            )
            return 0
        code = code + CalculateKSCTriad(sequence, gap, features, AADict)
        encodings.append(code)

    return encodings


if __name__ == '__main__':
    # Command line: <fasta file> [k, default 5] [output file, default 'encoding.tsv'].
    argc = len(sys.argv)
    if argc == 1:
        # No input file given: show the usage text and stop.
        print(USAGE)
        sys.exit(1)

    fastas = readFasta.readFasta(sys.argv[1])

    # Optional arguments fall back to their defaults when absent.
    if argc >= 3:
        k = int(sys.argv[2])
    else:
        k = 5
    if argc >= 4:
        output = sys.argv[3]
    else:
        output = 'encoding.tsv'

    encodings = KSCTriad(fastas, k)
    saveCode.savetsv(encodings, output)
Ejemplo n.º 2
0
def CTDCClass(fastas, groups):
	"""Encode every sequence as one value per residue group.

	For each record the '-' characters are stripped from the sequence, and
	for each group the value ``Count(group, sequence) / len(sequence)`` is
	stored (presumably the fraction of residues that belong to the group;
	``Count`` is defined elsewhere in this file).

	Args:
		fastas: iterable of records; item 0 is the sequence name and item 1
			is the sequence string.
		groups: sized sequence of residue groups, each passed to ``Count``.

	Returns:
		A list of rows. The first row is the header
		``['#', 'g.1', ..., 'g.N']``; every following row is
		``[name, value_1, ..., value_N]``. A sequence that is empty once the
		'-' characters are removed gets ``0.0`` for every group instead of
		raising ``ZeroDivisionError``.
	"""
	header = ['#'] + ['g.' + str(g) for g in range(1, len(groups) + 1)]
	encodings = [header]

	for record in fastas:
		name, sequence = record[0], re.sub('-', '', record[1])
		length = len(sequence)
		if length == 0:
			# Nothing to count in an empty (or all-gap) sequence; avoid the
			# division by zero and report zero for every group.
			code = [name] + [0.0] * len(groups)
		else:
			code = [name] + [Count(group, sequence) / length for group in groups]
		encodings.append(code)
	return encodings


if __name__ == '__main__':
	# Command line: <fasta file> <output file> <group 1> <group 2> [...]
	# At least two groups are required, hence a minimum of five argv entries.
	if len(sys.argv) < 5:
		print(USAGE)
		sys.exit(1)

	groups = sys.argv[3:]
	# Validate the grouping: after dropping every character that is not one
	# of the 20 amino-acid letters, the concatenated groups must contain
	# exactly 20 letters with no repeats.
	myStr = ''.join(groups)
	myStr = re.sub('[^ACDEFGHIKLMNPQRSTVWY]', '', myStr)
	if len(myStr) != 20 or len(set(myStr)) != 20:
		print('\nERROR: The amino acid must be no-repeat in each groups and the sum is 20!\n')
		# Stop here: previously the script went on and wrote an encoding
		# computed from the invalid grouping.
		sys.exit(1)
	fastas = readFasta.readFasta(sys.argv[1])
	encodings = CTDCClass(fastas, groups)
	saveCode.savetsv(encodings, sys.argv[2])
                sum([
                    Rvalue(sequence[j], sequence[j + n], AADict, AAProperty1)
                    for j in range(len(sequence) - n)
                ]) / (len(sequence) - n) if (len(sequence) - n) != 0 else 0)
        myDict = {}
        for aa in AA:
            myDict[aa] = sequence.count(aa)
        code = code + [myDict[aa] / (1 + w * sum(theta)) for aa in AA]
        code = code + [(w * j) / (1 + w * sum(theta)) for j in theta]
        encodings.append(code)
    return encodings


# Encode each input dataset with PAAC and save the result as a TSV file.
# Datasets are processed in the listed order; after the loop `fastas` and
# `encodings` hold the values of the last dataset.
for fasta_path, tsv_path in (
    ('datasets_fasta/dataset_balanceado.txt', 'BBDD/encodings_dataset_balanceado.tsv'),
    ('datasets_fasta/All_peptides.txt', 'BBDD/encodings_Allpeptides.tsv'),
    ('datasets_fasta/dataset_ejemplo.csv', 'BBDD/encodings_dataset_ejemplo.tsv'),
):
    fastas = readFasta.readFasta(fasta_path)
    encodings = PAAC(fastas)
    saveCode.savetsv(encodings, tsv_path)