def get_phyche_index(k, phyche_list, alphabet):
    """Collect, per k-mer, the factor values whose names are in phyche_list.

    :param k: passed through to make_kmer_list and get_phyche_factor_dic.
    :param phyche_list: container of selected factor names; the first item
        of every factor entry is tested for membership in it.
    :param alphabet: passed through to make_kmer_list and
        get_phyche_factor_dic.
    :return: dict mapping every k-mer from make_kmer_list(k, alphabet) to a
        list holding the second item of each matching factor entry. Every
        list is empty when phyche_list is empty.
    """
    # Nothing selected: every k-mer still gets an (empty) entry, and the
    # factor table is never loaded.
    if len(phyche_list) == 0:
        return {kmer: [] for kmer in make_kmer_list(k, alphabet)}

    factor_table = get_phyche_factor_dic(k, alphabet)
    selected = {}
    for kmer in make_kmer_list(k, alphabet):
        values = selected.setdefault(kmer, [])
        # Each entry is indexed as (name, value); keep the value when the
        # name was requested.
        values.extend(
            entry[1] for entry in factor_table[kmer]
            if entry[0] in phyche_list
        )
    return selected
def make_pseknc_vector(sequence_list, phyche_value, k=2, w=0.05, lamada=1,
                       alphabet=index_list.DNA, theta_type=1):
    """Generate the pseknc vector for every sequence in sequence_list.

    For each sequence the result row holds the normalized k-mer occurrence
    frequencies followed by the weighted theta factors, all divided by
    ``1 + w * sum(theta_list)`` and rounded to 8 decimal places.

    :param sequence_list: iterable of sequences.
    :param phyche_value: physicochemical values, forwarded unchanged to the
        theta factor helpers.
    :param k: k-mer length used for the frequency part.
    :param w: weight applied to the theta factors.
    :param lamada: forwarded to the theta factor helpers; each sequence must
        have at least ``lamada + k`` items.
    :param alphabet: alphabet used to enumerate k-mers
        (default index_list.DNA).
    :param theta_type: selects how theta_list is computed:
        1 -> get_parallel_factor(k, ...),
        2 -> get_series_factor(k, ...),
        3 -> get_parallel_factor with k fixed to 2.
    :return: list with one feature list per input sequence.
    :raises SystemExit: (status 1) when a sequence is shorter than
        ``lamada + k``; the reason is written to stderr first.
    :raises ValueError: when theta_type is not 1, 2 or 3.
    :raises ZeroDivisionError: when the k-mer frequencies of a sequence sum
        to zero, because they are normalized by that sum.
    """
    kmer = make_kmer_list(k, alphabet)
    vector = []

    for sequence in sequence_list:
        if len(sequence) < k or lamada + k > len(sequence):
            error_info = ("Sorry, the sequence length must be larger than "
                          + str(lamada + k))
            sys.stderr.write(error_info + "\n")
            # This is a failure path, so report a non-zero exit status.
            sys.exit(1)

        # Get the nucleotide frequency in the DNA sequence.
        fre_list = [frequency(sequence, str(key)) for key in kmer]
        fre_sum = float(sum(fre_list))

        # Get the normalized occurrence frequency of nucleotide in the DNA
        # sequence.
        fre_list = [e / fre_sum for e in fre_list]

        # Get the theta_list.
        if theta_type == 1:
            theta_list = get_parallel_factor(k, lamada, sequence,
                                             phyche_value, alphabet)
        elif theta_type == 2:
            theta_list = get_series_factor(k, lamada, sequence,
                                           phyche_value, alphabet)
        elif theta_type == 3:
            # This variant always uses k=2 for the theta factors, whatever
            # k is used for the frequency part.
            theta_list = get_parallel_factor(k=2, lamada=lamada,
                                             sequence=sequence,
                                             phyche_value=phyche_value,
                                             alphabet=alphabet)
        else:
            # Fail with a clear message instead of hitting an unbound
            # theta_list further down.
            raise ValueError(
                "theta_type must be 1, 2 or 3, got %r" % (theta_type,))
        theta_sum = sum(theta_list)

        # Generate the vector according the Equation 9.
        denominator = 1 + w * theta_sum

        temp_vec = [round(f / denominator, 8) for f in fre_list]
        temp_vec.extend(round(w * theta / denominator, 8)
                        for theta in theta_list)

        vector.append(temp_vec)

    return vector