def KNNprotein(fastas, **kw):
    """Encode proteins by the label make-up of their best-scoring training hits.

    Every query is scored against every training sequence whose name differs
    from the query's name, using ``CalculateSimilarity``. The scored training
    entries are ranked by decreasing score and, for each neighbourhood size,
    ``CalculateContent`` supplies one value per label.

    Args:
        fastas: Sequence of records; item 0 is the name, item 1 the sequence.
        **kw: Must contain ``train`` (path handed to ``readFasta.readFasta``)
            and ``label`` (path of a text file whose non-blank lines are
            ``<name> <integer label>`` separated by whitespace).

    Returns:
        A list of rows. The first row is the header (``'#'`` followed by
        ``Top<ratio>.label<label>`` names); each following row is the query
        name followed by its feature values.

    Raises:
        SystemExit: With status 1 when ``train``/``label`` is missing, the
            label file does not exist, or the number of labels differs from
            the number of training records.
    """
    trainFile = kw.get('train')
    labelFile = kw.get('label')

    # Check for missing arguments first: os.path.exists(None) raises TypeError,
    # which used to hide the intended usage message.
    if trainFile is None or labelFile is None:
        print(
            'Error: please specify the directory of train file ["--train"] and the label file ["--label"]'
        )
        sys.exit(1)
    if not os.path.exists(labelFile):
        print('Error: the label file does not exist.')
        sys.exit(1)

    trainData = readFasta.readFasta(trainFile)

    myLabel = {}
    with open(labelFile) as f:
        for line in f:
            # Blank lines (e.g. a trailing newline) carry no label; skip them.
            if not line.strip():
                continue
            fields = line.split()
            myLabel[fields[0]] = int(fields[1])
    myLabelSets = list(set(myLabel.values()))

    if len(trainData) != len(myLabel):
        print('ERROR: the inconsistent sample number in train and label file.')
        sys.exit(1)

    # Neighbourhood sizes are 1%, 2%, ..., 30% of the training set, rounded up.
    kValues = [step / 100 for step in range(1, 31)]
    kNum = [math.ceil(len(trainData) * ratio) for ratio in kValues]

    header = ['#']
    for ratio in kValues:
        for label in myLabelSets:
            header.append('Top' + str(ratio) + '.label' + str(label))
    encodings = [header]

    # Clean the training sequences once instead of once per query.
    # NOTE(review): queries keep '-' characters while training sequences drop
    # them; this asymmetry is inherited from the original code.
    cleanedTrain = [
        (record[0], re.sub('[^ARNDCQEGHILKMFPSTWYV]', '', record[1]))
        for record in trainData
    ]

    for record in fastas:
        name = record[0]
        sequence = re.sub('[^ARNDCQEGHILKMFPSTWYV-]', '', record[1])

        mySimilarity = np.array([
            [myLabel[trainName], CalculateSimilarity(trainSeq, sequence)]
            for trainName, trainSeq in cleanedTrain
            if name != trainName  # never compare a sample with itself
        ])
        # lexsort uses the last key as primary: rows are ordered by decreasing
        # score, ties broken by decreasing label.
        mySimilarity = mySimilarity[np.lexsort(-mySimilarity.T)]

        code = [name]
        for top in kNum:
            code = code + CalculateContent(mySimilarity, top, myLabelSets)
        encodings.append(code)
    return encodings
def get_feature(protein_fasta_file, feature_type, output_dir):
    """Compute one descriptor for a FASTA file and save it as a TSV file.

    The descriptor is produced by calling ``<feature_type>.<feature_type>``
    (a function named like the object that holds it) with the parsed records
    and ``order='ACDEFGHIKLMNPQRSTVWY'``.

    Args:
        protein_fasta_file: Path handed to ``readFasta.readFasta``.
        feature_type: Descriptor name; must be a plain Python identifier.
        output_dir: Prefix of the output path. It is concatenated directly
            with ``<feature_type>.tsv``, so it needs its own trailing
            separator.

    Returns:
        None. The encodings are written through ``saveCode.savetsv``.

    Raises:
        ValueError: If ``feature_type`` is not a valid identifier.
    """
    # feature_type is spliced into source text that is passed to eval();
    # anything other than a bare identifier could execute arbitrary code.
    if not isinstance(feature_type, str) or not feature_type.isidentifier():
        raise ValueError(f'Invalid descriptor type: {feature_type!r}')

    fastas = readFasta.readFasta(protein_fasta_file)
    kw = {'order': 'ACDEFGHIKLMNPQRSTVWY'}

    print('Descriptor type: ' + feature_type)
    # SECURITY NOTE(review): eval is kept so name resolution stays exactly as
    # before; the identifier check above limits it to `name.name(...)`.
    encodings = eval(f"{feature_type}.{feature_type}(fastas, **kw)")

    outFile = f'{output_dir}{feature_type}.tsv'
    saveCode.savetsv(encodings, outFile)
    return
# NOTE(review): this line is whitespace-collapsed and starts inside a function
# whose `def` header is outside this chunk (the driver below calls it as
# generateSecondaryStructure(fastas, 'out', psipred)). It is kept verbatim
# because the original nesting cannot be recovered with certainty.
#
# Function tail, in statement order:
#   - create `outDir` when it does not exist;
#   - for each record: remove '|' from the name, write '>name' and the
#     sequence to '<name>.txt', build the command `psipred + ' ' + <name>.txt`;
#   - run that command through os.system when '<outDir>/<name>.ss2' is absent;
#   - remove '<name>.txt', '<name>.ss' and '<name>.horiz', then move
#     '<name>.ss2' into `outDir` (whether these sit inside the `if` is not
#     visible here);
#   - return `outDir`.
# NOTE(review): the command is a concatenated shell string passed to
# os.system; names containing shell metacharacters would be unsafe.
#
# Script entry point: parses --file (required) and --psipred (falls back to
# 'runpsipred'), reads the FASTA file, runs the helper with output directory
# 'out' and prints where the results were stored.
# NOTE(review): the final message spells "secondary" as "secodnary".
if os.path.exists(outDir) == False: os.mkdir(outDir) for i in fastas: name, sequence = re.sub('\|', '', i[0]), i[1] with open(name + '.txt', 'w') as f: f.write('>' + name + '\n' + sequence + '\n') myCmd = psipred + ' ' + name + '.txt' if os.path.exists(outDir + '/' + name + '.ss2') == False: os.system(myCmd) os.remove(name + '.txt') os.remove(name + '.ss') os.remove(name + '.horiz') shutil.move(name + '.ss2', outDir) return outDir if __name__ == '__main__': parser = argparse.ArgumentParser( usage="it's usage tip.", description="generate protein secondary structure profile") parser.add_argument("--file", required=True, help="protein sequence file in fasta format") parser.add_argument("--psipred", help="the path of psipred program") args = parser.parse_args() psipred = args.psipred if args.psipred != None else 'runpsipred' fastas = readFasta.readFasta(args.file) outputDir = generateSecondaryStructure(fastas, 'out', psipred) print('The predicted secodnary structure are stored in: ' + outputDir)
# NOTE(review): this line is whitespace-collapsed and starts in the middle of
# a loop body whose enclosing function header is outside this chunk. It is
# kept verbatim because the original nesting cannot be recovered safely.
#
# Function tail, in statement order:
#   - append [label of training record j, CalculateDistance(training
#     sequence, query sequence)] to `myDistance`;
#   - convert the list to a NumPy array and reorder its rows with
#     np.lexsort(myDistance.T) (last column is the primary key, ascending);
#   - for each size in `kNum`, extend `code` with
#     CalculateContent(myDistance, size, myLabelSets);
#   - append `code` to `encodings` and finally return `encodings`.
#
# Ad-hoc driver: reads "data_test.txt", calls KNNpeptide with hard-coded
# paths, drops the header row and the name column, keeps every second
# remaining column starting at index 1 (range(1, column, 2)) and wraps the
# result in a pandas DataFrame named `test`.
# NOTE(review): in this collapsed form everything after the `#kw=` marker is
# part of a comment; presumably only the alternative `kw` dict was meant to be
# commented out -- confirm against the original file.
myDistance.append([ myLabel[trainData[j][0]], CalculateDistance(trainData[j][1], sequence) ]) myDistance = np.array(myDistance) myDistance = myDistance[np.lexsort(myDistance.T)] for j in kNum: code = code + CalculateContent(myDistance, j, myLabelSets) encodings.append(code) return encodings fastas = readFasta.readFasta("data_test.txt") kw = { 'path': "D:\\xw\\3_特征提取\\KNN\\codes\\codes", 'train': "data_test.txt", 'label': "label_test.txt", 'order': 'ACDEFGHIKLMNPQRSTVWY' } #kw= {'path': "D:\\xw\\S-sulfenylation_特征提取\\KNN\\KNN0710\\codes",'train':"data_train.txt",'label':"label_train.txt",'order':'ACDEFGHIKLMNPQRSTVWY'} data_KNN = KNNpeptide(fastas, **kw) data_raw = data_KNN[1:] data_new = np.matrix(data_raw) data_knn = data_new[:, 1:] column = data_knn.shape[1] data_knn_feature = data_knn[:, np.array(range(1, column, 2))] test = pd.DataFrame(data=data_knn_feature)
# NOTE(review): this line is whitespace-collapsed and starts inside a function
# whose header is outside this chunk (the driver below calls AAINDEX, so this
# is presumably its tail -- confirm). It is kept verbatim because the original
# nesting cannot be recovered safely.
#
# Function tail, in statement order:
#   - append the header row to `encodings`;
#   - for each record, start a row with its name and walk the sequence:
#     a '-' contributes one 0 per entry of `AAindex`; any other residue `aa`
#     contributes j[index[aa]] for every entry j of `AAindex`;
#   - append the row to `encodings` and finally return `encodings`.
#
# Ad-hoc driver: reads a hard-coded FASTA file, calls AAINDEX, drops the
# header row and the name column, and writes the remaining matrix to
# 'AAindex.csv' through a pandas DataFrame.
encodings.append(header) for i in fastas: name, sequence = i[0], i[1] code = [name] for aa in sequence: if aa == '-': for j in AAindex: code.append(0) continue for j in AAindex: code.append(j[index[aa]]) encodings.append(code) return encodings fastas = readFasta.readFasta(r"F:\python\KNN\test_A.txt") kw = { 'path': r"F:\python\KNN", 'train': r"F:\python\KNN\test_A.txt", 'label': r"F:\python\KNN\label_A.txt", 'order': 'ACDEFGHIKLMNPQRSTVWY' } result = AAINDEX(fastas, **kw) data = result[1:] data_new = np.matrix(data) data_AAindex = data_new[:, 1:] data_AAindex_end = pd.DataFrame(data=data_AAindex) data_AAindex_end.to_csv('AAindex.csv')
def KNNpeptide(fastas, **kw):
    """Encode peptides by the label make-up of their nearest training peptides.

    Every query is compared with every training sequence whose name differs
    from the query's name, using ``CalculateDistance``. The scored training
    entries are ranked by increasing distance and, for each neighbourhood
    size, ``CalculateContent`` supplies one value per label.

    Args:
        fastas: Sequence of records; item 0 is the name, item 1 the sequence.
        **kw: Must contain ``train`` (path handed to ``readFasta.readFasta``)
            and ``label`` (path of a text file whose non-blank lines are
            ``<name>\\t<integer label>``).

    Returns:
        A list of rows. The first row is the header (``'#'`` followed by
        ``Top<k>.label<label>`` names, where ``k`` is the neighbour count);
        each following row is the query name followed by its feature values.

    Raises:
        SystemExit: With status 1 when ``train``/``label`` is missing, the
            label file does not exist, or the number of labels differs from
            the number of training records.
    """
    # .get() turns an absent key into the same usage error as an explicit None.
    trainFile = kw.get('train')
    labelFile = kw.get('label')

    if trainFile is None or labelFile is None:
        print('Error: please specify the directory of train file ["--train"] and the label file ["--label"]')
        sys.exit(1)
    if not os.path.exists(labelFile):
        print('Error: the label file does not exist.')
        sys.exit(1)

    trainData = readFasta.readFasta(trainFile)

    myLabel = {}
    with open(labelFile) as f:
        for line in f:
            # Blank lines (e.g. a trailing newline) carry no label; skip them
            # instead of failing on a missing second field.
            if not line.strip():
                continue
            fields = line.rstrip().split('\t')
            if fields[0] in myLabel:
                # Report names that occur more than once; the last label wins.
                print(fields[0])
            myLabel[fields[0]] = int(fields[1])
    myLabelSets = list(set(myLabel.values()))

    if len(trainData) != len(myLabel):
        # Diagnostic counts, printed in the same order as before.
        print(len(myLabel))
        print(len(trainData))
        print(len(myLabel))
        print('ERROR: inconsistent sample number between train and label file.')
        sys.exit(1)

    # Neighbourhood sizes are given as ratios of the training set (1%..10%),
    # rounded up to whole neighbours.
    kValues = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.10]
    kNum = [math.ceil(len(trainData) * ratio) for ratio in kValues]

    header = ['#']
    for top in kNum:
        for label in myLabelSets:
            header.append('Top' + str(top) + '.label' + str(label))
    encodings = [header]

    for record in fastas:
        name, sequence = record[0], record[1]

        myDistance = np.array([
            [myLabel[trainRecord[0]], CalculateDistance(trainRecord[1], sequence)]
            for trainRecord in trainData
            if name != trainRecord[0]  # never compare a sample with itself
        ])
        # lexsort uses the last key as primary: rows are ordered by increasing
        # distance, ties broken by increasing label.
        myDistance = myDistance[np.lexsort(myDistance.T)]

        code = [name]
        for top in kNum:
            code = code + CalculateContent(myDistance, top, myLabelSets)
        encodings.append(code)
    return encodings
return distance def CalculateContent(myDistance, j, myLabelSets): content = [] myDict = {} for i in myLabelSets: myDict[i] = 0 for i in range(j): myDict[myDistance[i][0]] = myDict[myDistance[i][0]] + 1 for i in myLabelSets: content.append(myDict[myLabelSets[i]] / j) return content ffastas = readFasta.readFasta(r"F:\python\KNN\test_A.txt") kw = { 'path': r"F:\python\KNN", 'train': r"F:\python\KNN\test_A.txt", 'label': r"F:\python\KNN\label_A.txt", 'order': 'ACDEFGHIKLMNPQRSTVWY' } trainFile = kw['train'] labelFile = kw['label'] if trainFile == None or labelFile == None: print( 'Error: please specify the directory of train file ["--train"] and the label file ["--label"]' ) sys.exit(1) if os.path.exists(labelFile) == False: