Beispiel #1
0
def KNNprotein(fastas, **kw):
    """Encode proteins by the label make-up of their most similar training sequences.

    For every query sequence, each training sequence with a different name is
    scored with ``CalculateSimilarity``; the scored neighbours are ordered by
    descending similarity and, for each neighbourhood size, the per-label
    fractions returned by ``CalculateContent`` are appended to the row.

    Args:
        fastas: iterable of ``[name, sequence]`` records to encode.
        **kw: must provide ``train`` (path to the training FASTA file) and
            ``label`` (path to a whitespace-separated ``name label`` file with
            integer labels).

    Returns:
        A list of rows. The first row is the header (``'#'`` followed by
        ``Top<k>.label<l>`` names); each following row is
        ``[name, fraction, fraction, ...]``.

    Exits:
        Calls ``sys.exit(1)`` after printing a message when a path is missing,
        the label file does not exist, or the number of training sequences
        differs from the number of labelled names.
    """
    trainFile = kw.get('train')
    labelFile = kw.get('label')

    # Check for missing arguments first: os.path.exists(None) raises TypeError,
    # which would hide the intended usage message.
    if trainFile is None or labelFile is None:
        print(
            'Error: please specify the directory of train file ["--train"] and the label file ["--label"]'
        )
        sys.exit(1)

    if not os.path.exists(labelFile):
        print('Error: the label file does not exist.')
        sys.exit(1)

    trainData = readFasta.readFasta(trainFile)

    myLabel = {}
    with open(labelFile) as f:
        for record in f:
            fields = record.split()
            if not fields:
                # Blank (or whitespace-only) lines carry no label; skip them.
                continue
            myLabel[fields[0]] = int(fields[1])
    myLabelSets = list(set(myLabel.values()))

    if len(trainData) != len(myLabel):
        print('ERROR: the inconsistent sample number in train and label file.')
        sys.exit(1)

    # Neighbourhood sizes are 1% .. 30% of the training set, rounded up.
    kValues = [step / 100 for step in range(1, 31)]
    kNum = [math.ceil(len(trainData) * k) for k in kValues]

    header = ['#']
    for k in kValues:
        for l in myLabelSets:
            header.append('Top' + str(k) + '.label' + str(l))
    encodings = [header]

    # Loop-invariant: strip non-standard residues from the training sequences
    # once instead of once per query sequence.
    trainClean = [
        (entry[0], re.sub('[^ARNDCQEGHILKMFPSTWYV]', '', entry[1]))
        for entry in trainData
    ]

    for i in fastas:
        # NOTE(review): the query keeps '-' characters while training sequences
        # drop them; preserved from the original, confirm it is intended.
        name, sequence = i[0], re.sub('[^ARNDCQEGHILKMFPSTWYV-]', '', i[1])
        code = [name]
        # A query that is itself part of the training set is not compared
        # against its own entry.
        mySimilarity = [
            [myLabel[trainName], CalculateSimilarity(trainSeq, sequence)]
            for trainName, trainSeq in trainClean
            if trainName != name
        ]
        mySimilarity = np.array(mySimilarity)
        # lexsort uses the last key (the negated similarity) as primary key,
        # so rows end up ordered by descending similarity.
        mySimilarity = mySimilarity[np.lexsort(-mySimilarity.T)]
        for j in kNum:
            code = code + CalculateContent(mySimilarity, j, myLabelSets)
        encodings.append(code)
    return encodings
Beispiel #2
0
def get_feature(protein_fasta_file, feature_type, output_dir):
	"""Compute one descriptor for a FASTA file and save it as a TSV file.

	The descriptor is produced by calling ``<feature_type>.<feature_type>``,
	i.e. a callable named like the module-level object that holds it, with the
	parsed sequences and ``order='ACDEFGHIKLMNPQRSTVWY'``.

	Args:
		protein_fasta_file: path handed to ``readFasta.readFasta``.
		feature_type: name of the descriptor; must be a plain identifier that
			is bound at module level in this file.
		output_dir: prefix of the output path. It is concatenated with the
			file name as-is, so it has to end with a path separator.

	Raises:
		ValueError: if ``feature_type`` is not a plain identifier.
		NameError: if no module-level object of that name exists.
	"""
	# SECURITY: the original built a source string from feature_type and ran
	# it through eval(), so any caller-supplied text was executed as code.
	# Restrict the value to a bare identifier and resolve it explicitly.
	if not isinstance(feature_type, str) or not feature_type.isidentifier():
		raise ValueError(f'invalid descriptor type: {feature_type!r}')

	fastas = readFasta.readFasta(protein_fasta_file)
	kw = {'order': 'ACDEFGHIKLMNPQRSTVWY'}

	print('Descriptor type: ' + feature_type)
	try:
		descriptorModule = globals()[feature_type]
	except KeyError:
		# Same exception type eval() raised for an unknown name.
		raise NameError(f"name '{feature_type}' is not defined") from None
	encodings = getattr(descriptorModule, feature_type)(fastas, **kw)

	outFile = f'{output_dir}{feature_type}.tsv'
	saveCode.savetsv(encodings, outFile)
	return
    if os.path.exists(outDir) == False:
        os.mkdir(outDir)

    for i in fastas:
        name, sequence = re.sub('\|', '', i[0]), i[1]
        with open(name + '.txt', 'w') as f:
            f.write('>' + name + '\n' + sequence + '\n')
        myCmd = psipred + ' ' + name + '.txt'
        if os.path.exists(outDir + '/' + name + '.ss2') == False:
            os.system(myCmd)
            os.remove(name + '.txt')
            os.remove(name + '.ss')
            os.remove(name + '.horiz')
            shutil.move(name + '.ss2', outDir)
    return outDir


if __name__ == '__main__':
    # Command-line entry point: read a FASTA file, run the secondary-structure
    # predictor on every sequence and report where the results were stored.
    # No custom `usage` string is passed, so argparse generates an accurate
    # usage line from the declared arguments (the original placeholder text
    # replaced it with "it's usage tip.").
    parser = argparse.ArgumentParser(
        description="generate protein secondary structure profile")
    parser.add_argument("--file",
                        required=True,
                        help="protein sequence file in fasta format")
    parser.add_argument("--psipred", help="the path of psipred program")
    args = parser.parse_args()

    # Fall back to the 'runpsipred' command when no path is given.
    psipred = args.psipred if args.psipred is not None else 'runpsipred'
    fastas = readFasta.readFasta(args.file)
    # Results are written to the 'out' directory.
    outputDir = generateSecondaryStructure(fastas, 'out', psipred)
    print('The predicted secondary structures are stored in: ' + outputDir)
Beispiel #4
0
                myDistance.append([
                    myLabel[trainData[j][0]],
                    CalculateDistance(trainData[j][1], sequence)
                ])

        myDistance = np.array(myDistance)
        myDistance = myDistance[np.lexsort(myDistance.T)]

        for j in kNum:
            code = code + CalculateContent(myDistance, j, myLabelSets)
        encodings.append(code)

    return encodings


# Driver script: compute the KNN peptide descriptors for the sequences in
# "data_test.txt" and collect a subset of the feature columns in a DataFrame.
fastas = readFasta.readFasta("data_test.txt")
kw = {
    'path': "D:\\xw\\3_特征提取\\KNN\\codes\\codes",
    'train': "data_test.txt",
    'label': "label_test.txt",
    'order': 'ACDEFGHIKLMNPQRSTVWY'
}
# Alternative configuration, kept for reference (uses the training files):
#kw=  {'path': "D:\\xw\\S-sulfenylation_特征提取\\KNN\\KNN0710\\codes",'train':"data_train.txt",'label':"label_train.txt",'order':'ACDEFGHIKLMNPQRSTVWY'}
data_KNN = KNNpeptide(fastas, **kw)

# Drop the first row of the encodings (the header row).
data_raw = data_KNN[1:]
data_new = np.matrix(data_raw)
# Drop the first column, which holds the sequence name.
data_knn = data_new[:, 1:]
column = data_knn.shape[1]
# Keep only the odd-indexed columns (1, 3, 5, ...).
# NOTE(review): presumably this selects the columns of a single label when
# there are exactly two labels -- confirm against the label file.
data_knn_feature = data_knn[:, np.array(range(1, column, 2))]
test = pd.DataFrame(data=data_knn_feature)
Beispiel #5
0
    encodings.append(header)

    for i in fastas:
        name, sequence = i[0], i[1]
        code = [name]
        for aa in sequence:
            if aa == '-':
                for j in AAindex:
                    code.append(0)
                continue
            for j in AAindex:
                code.append(j[index[aa]])
        encodings.append(code)

    return encodings


# Driver script: compute the AAINDEX encoding for the sequences in test_A.txt
# and write the numeric part of the result to 'AAindex.csv'.
fastas = readFasta.readFasta(r"F:\python\KNN\test_A.txt")
kw = {
    'path': r"F:\python\KNN",
    'train': r"F:\python\KNN\test_A.txt",
    'label': r"F:\python\KNN\label_A.txt",
    'order': 'ACDEFGHIKLMNPQRSTVWY'
}
result = AAINDEX(fastas, **kw)
# Drop the first row of the encodings (the header row).
data = result[1:]
data_new = np.matrix(data)
# Drop the first column, which holds the sequence name.
data_AAindex = data_new[:, 1:]
data_AAindex_end = pd.DataFrame(data=data_AAindex)
# Written relative to the current working directory.
data_AAindex_end.to_csv('AAindex.csv')
Beispiel #6
0
def KNNpeptide(fastas, **kw):
	"""Encode peptides by the label make-up of their nearest training sequences.

	For every query sequence, each training sequence with a different name is
	scored with ``CalculateDistance``; the scored neighbours are ordered by
	ascending distance and, for each neighbourhood size, the per-label
	fractions returned by ``CalculateContent`` are appended to the row.

	Args:
		fastas: iterable of ``[name, sequence]`` records to encode.
		**kw: must provide ``train`` (path to the training FASTA file) and
			``label`` (path to a tab-separated ``name<TAB>label`` file with
			integer labels).

	Returns:
		A list of rows. The first row is the header (``'#'`` followed by
		``Top<k>.label<l>`` names, where ``k`` is the neighbour count); each
		following row is ``[name, fraction, fraction, ...]``.

	Exits:
		Calls ``sys.exit(1)`` after printing a message when a path is missing,
		the label file does not exist, or the number of training sequences
		differs from the number of labelled names.
	"""
	trainFile = kw.get('train')
	labelFile = kw.get('label')
	if trainFile is None or labelFile is None:
		print('Error: please specify the directory of train file ["--train"] and the label file ["--label"]')
		sys.exit(1)

	if not os.path.exists(labelFile):
		print('Error: the label file does not exist.')
		sys.exit(1)

	trainData = readFasta.readFasta(trainFile)

	myLabel = {}
	with open(labelFile) as f:
		for record in f:
			# Skip blank lines. The original guard used strip('\t'), which
			# leaves the newline in place and therefore never matched them.
			if not record.strip():
				continue
			fields = record.rstrip().split('\t')
			if fields[0] in myLabel:
				# Report names that occur more than once; the last label wins.
				print (fields[0])
			myLabel[fields[0]] = int(fields[1])
	myLabelSets = list(set(myLabel.values()))

	if len(trainData) != len(myLabel):
		print('labels: ' + str(len(myLabel)) + ', training sequences: ' + str(len(trainData)))
		print('ERROR: inconsistent sample number between train and label file.')
		sys.exit(1)

	# Neighbourhood sizes are fractions (1% .. 10%) of the training-set size,
	# rounded up. A fixed list such as [2, 4, 8, 16, 32, 64] is the alternative.
	kValues = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.10]
	kNum = [math.ceil(len(trainData) * ratio) for ratio in kValues]

	header = ['#']
	for k in kNum:
		for l in myLabelSets:
			header.append('Top' + str(k) + '.label' + str(l))
	encodings = [header]

	for i in fastas:
		name, sequence = i[0], i[1]
		code = [name]
		# A query that is itself part of the training set is not compared
		# against its own entry.
		myDistance = [
			[myLabel[entry[0]], CalculateDistance(entry[1], sequence)]
			for entry in trainData
			if entry[0] != name
		]
		myDistance = np.array(myDistance)
		# lexsort uses the last key (the distance column) as primary key, so
		# rows end up ordered by ascending distance.
		myDistance = myDistance[np.lexsort(myDistance.T)]

		for j in kNum:
			code = code + CalculateContent(myDistance, j, myLabelSets)
		encodings.append(code)

	return encodings
Beispiel #7
0
    return distance


def CalculateContent(myDistance, j, myLabelSets):
    """Return the share of each label among the first ``j`` rows.

    Args:
        myDistance: sequence of rows whose first element is a label; only the
            first ``j`` rows are read, so the caller decides the ordering.
        j: number of leading rows to count (also the divisor).
        myLabelSets: the labels to report, in output order.

    Returns:
        A list with one fraction per entry of ``myLabelSets``.

    Raises:
        IndexError: if ``myDistance`` has fewer than ``j`` rows.
        KeyError: if a counted row carries a label not in ``myLabelSets``.
        ZeroDivisionError: if ``j`` is 0.
    """
    counts = {label: 0 for label in myLabelSets}
    for row in range(j):
        counts[myDistance[row][0]] += 1
    # Look each label up directly. The original indexed myLabelSets with the
    # label value itself, which is only correct when the labels are [0, 1].
    return [counts[label] / j for label in myLabelSets]


# Driver script: load the input sequences and validate the train/label
# configuration before the KNN encoding steps that follow.
# NOTE(review): the name `ffastas` looks like a typo for `fastas` -- confirm
# which name the code further down actually reads.
ffastas = readFasta.readFasta(r"F:\python\KNN\test_A.txt")
kw = {
    'path': r"F:\python\KNN",
    'train': r"F:\python\KNN\test_A.txt",
    'label': r"F:\python\KNN\label_A.txt",
    'order': 'ACDEFGHIKLMNPQRSTVWY'
}
trainFile = kw['train']
labelFile = kw['label']
# Both paths are hard-coded above, so this guard can only fire if the
# dictionary is edited to contain None.
if trainFile == None or labelFile == None:
    print(
        'Error: please specify the directory of train file ["--train"] and the label file ["--label"]'
    )
    sys.exit(1)

if os.path.exists(labelFile) == False: