def get_psedpc_matrix(filename, n, r, w, pattern_list=None):
    '''Generate one feature vector per record of the input file.

    The FASTA-like format of the input file is as follows:
        >sequence name
        An RNA sequence consisting of A, G, C and U
        Secondary structure

    :param filename: Name of input file.
    :param n: The maximum distance between structure statuses.
    :param r: The highest counted rank (or tier) of the structural
        correlation along an RNA chain.
    :param w: The weight of theta, from 0.1 to 1.
    :param pattern_list: Structure statuses. When omitted (None), a fresh
        list ['A', 'C', 'G', 'U', 'A-U', 'U-A', 'G-C', 'C-G', 'G-U', 'U-G']
        is created for this call.
    :return: List holding the result of get_psedpc_vector for each record
        produced by get_rnasc_data, in the same order.
    '''
    # NOTE(review): an identically named definition follows this one in the
    # file and overrides it at import time; one of the two should be removed.
    if pattern_list is None:
        # Build the default per call so that no list object is shared between
        # calls (a list literal in the signature is created only once).
        pattern_list = ['A', 'C', 'G', 'U',
                        'A-U', 'U-A', 'G-C', 'C-G', 'G-U', 'U-G']
    with open(filename) as f:
        # Records are consumed while the handle is still open, which is safe
        # whether or not get_rnasc_data reads the file lazily.
        return [get_psedpc_vector(seqss, n, r, w, pattern_list)
                for seqss in get_rnasc_data(f)]
def get_psedpc_matrix(filename, n, r, w, pattern_list=None):
    '''Generate one feature vector per record of the input file.

    The FASTA-like format of the input file is as follows:
        >sequence name
        An RNA sequence consisting of A, G, C and U
        Secondary structure

    :param filename: Name of input file.
    :param n: The maximum distance between structure statuses.
    :param r: The highest counted rank (or tier) of the structural
        correlation along an RNA chain.
    :param w: The weight of theta, from 0.1 to 1.
    :param pattern_list: Structure statuses. When omitted (None), a fresh
        list ['A', 'C', 'G', 'U', 'A-U', 'U-A', 'G-C', 'C-G', 'G-U', 'U-G']
        is created for this call.
    :return: List holding the result of get_psedpc_vector for each record
        produced by get_rnasc_data, in the same order.
    '''
    # NOTE(review): this redefines the identically named function that
    # directly precedes it in the file; the duplicate should be removed.
    if pattern_list is None:
        # A list literal in the signature would be evaluated once and shared
        # by every call, so the default is materialised here instead.
        pattern_list = ['A', 'C', 'G', 'U',
                        'A-U', 'U-A', 'G-C', 'C-G', 'G-U', 'U-G']
    with open(filename) as f:
        # Vectors are computed before the file is closed, so this works even
        # if get_rnasc_data yields its records lazily.
        return [get_psedpc_vector(seqss, n, r, w, pattern_list)
                for seqss in get_rnasc_data(f)]
def get_triplet_matrix(filename):
    '''Run the complete Triplet process and return its feature vectors.

    The FASTA-like format of the input file is as follows:
        >Sequence name
        An RNA sequence consisting of A, G, C and U
        Secondary structure

    :param filename: Name of input file.
    :return: Feature matrix through Triplet: a list holding the result of
        get_triplet_vector for each record produced by get_rnasc_data.
    '''
    # Keep this letter order fixed; changing it changes the order of the
    # resulting features.
    residues = 'AGCU'
    structure_symbols = ["(", "."]
    with open(filename) as handle:
        records = get_rnasc_data(handle)
        # The lookup is built once and shared by every record.
        triplet_lookup = get_triplet_dict(structure_symbols, 3, residues)
        return [get_triplet_vector(record, triplet_lookup)
                for record in records]
def get_pseknc_matrix(filename, k):
    '''Run the complete PseKNC process and return its feature matrix.

    The FASTA-like format of the input file is as follows:
        >Sequence name
        An RNA sequence consisting of A, G, C and U
        Secondary structure

    :param filename: Name of input file.
    :param k: Passed unchanged to get_pseknc_dict and get_pseknc_vector
        (presumably the k-mer length -- TODO confirm against those helpers).
    :return: Feature matrix through PseKNC: a list holding the result of
        get_pseknc_vector for each record produced by get_rnasc_data.
    '''
    nucleotides = list('ACGU')
    with open(filename) as handle:
        records = get_rnasc_data(handle)
        # The lookup is built once and shared by every record.
        kmer_lookup = get_pseknc_dict(nucleotides, k)
        return [get_pseknc_vector(record, kmer_lookup, k)
                for record in records]