def get_entropy_profile_per_sequence(seq, w, alias, out=None):
    """Sliding-window entropy profile of a single genomic sequence.

    :param seq: the genomic sequence (a string)
    :param w: the window size
    :param alias: column name for the result / file-name prefix for output
    :param out: optional output directory; if not None the profile is saved
        there as '<alias>_profile.csv'
    :return: a one-column DataFrame of per-window 5-mer entropies
    """
    # 5-mer entropy over every window of length w (fixed: dropped the unused
    # all_entropies dict and the stale docstring copied from the fasta variant)
    entropies = [entropy_by_kmer(seq[j:j + w], 5) for j in range(len(seq) - w)]
    df = pd.DataFrame({'{}'.format(alias): entropies})
    if out is not None:
        df.to_csv(os.path.join(out, '{}_profile.csv'.format(alias)), index=False)
    return df
def get_entropy_profile(fasta, w, out=None, type='fasta'):
    """Sliding-window entropy profile of all sequences in a sequence file.

    :param fasta: path to a file containing viral sequences
    :param w: the window size
    :param out: optional output directory; if not None the profile is saved
        there as '<alias>_profile.csv' (alias = file basename without extension)
    :param type: file format passed to SeqIO.parse (default 'fasta')
    :return: a DataFrame with one column per sequence ('seq_0', 'seq_1', ...)
    """
    all_entropies = {}
    alias = os.path.basename(fasta).split('.')[0]
    for i, rec in enumerate(SeqIO.parse(fasta, type)):
        genome = str(rec.seq)
        # 5-mer entropy over every window of length w
        entropies = [entropy_by_kmer(genome[j:j + w], 5)
                     for j in range(len(genome) - w)]
        print('Done with seq {}'.format(i))
        all_entropies['seq_{}'.format(i)] = entropies
    # wrap each list in a Series so columns of unequal length are NaN-padded
    df = pd.DataFrame({k: pd.Series(v) for k, v in all_entropies.items()})
    # bug fix: the original wrote the CSV unconditionally, so the default
    # out=None crashed inside os.path.join; only save when a directory is given
    if out is not None:
        df.to_csv(os.path.join(out, '{}_profile.csv'.format(alias)), index=False)
    return df
def test_real_data_per_seq(seq, data, w, index):
    """Predict a label for every length-w window of a sequence.

    Fits a random forest on `data`, then slides a window of size w over `seq`,
    computes the same features for each window, and predicts its label.

    :param seq: the sequence to classify (case-insensitive; lower-cased here)
    :param data: training data with feature columns
        ['entropy', 'lzw', 'a', 'c', 'g', 't'] and label column 'y'
    :param w: the window size used for the genome profile
    :param index: identifier used in the output column name
    :return: a DataFrame with a single column 'predicted_<index>'
    """
    seq = seq.lower()
    # fit a random forest model
    X = data[['entropy', 'lzw', 'a', 'c', 'g', 't']]  # Features
    y = data['y']  # Labels
    clf = RandomForestClassifier(n_estimators=100)
    # bug fix: removed `y_pred = clf.predict(X_test)` — X_test was never
    # defined (NameError) and the model had not been fitted yet at that point
    clf.fit(X, y)

    # iterate over the windows and compute the features needed for prediction
    dfs = []
    for j in tqdm(range(len(seq) - w)):
        sub_genome = seq[j:j + w]
        n = len(sub_genome)
        entropy = entropy_by_kmer(sub_genome, 5)
        lzw = len(compress(sub_genome))
        dfs.append(pd.DataFrame({'entropy': entropy, 'lzw': lzw,
                                 'a': sub_genome.count('a') / n,
                                 'c': sub_genome.count('c') / n,
                                 'g': sub_genome.count('g') / n,
                                 't': sub_genome.count('t') / n}, index=[j]))
    result = pd.concat(dfs)
    # predict new labels
    y_pred = clf.predict(result)
    return pd.DataFrame({'predicted_{}'.format(index): y_pred})
def genome_2_entropy(fasta, k):
    """Map each complete genome in a fasta file to its k-mer entropy.

    Only records whose header contains a '.' and the phrase 'complete genome'
    are kept; others are skipped with a message.

    :param fasta: a file containing all fasta records
    :param k: the k-mer size
    :return: a DataFrame with columns 'refseq_id' and 'entropy_<k>'
    """
    k_entropy = []
    ref_seq_id = []
    # bug fix: the original left the file handle open; use a context manager
    with open(fasta, "r") as handle:
        sequences = re.split(">", handle.read().replace('\n', ''))[1:]
    for seq in tqdm(sequences):
        if '.' not in seq:
            print('no dot in sequence name\n')
            continue
        if 'complete genome' not in seq:
            print('not complete genome\n')
            continue
        # get identifier and genomic sequence
        splitted = seq.split('.')
        identifier = remove_punctuation(splitted[0].split()[0])
        genome = splitted[-1]
        # calculate entropy
        entropy = entropy_by_kmer(genome, k)
        k_entropy.append(entropy)
        ref_seq_id.append(identifier)
    df = pd.DataFrame({'refseq_id': ref_seq_id,
                       'entropy_{}'.format(k): k_entropy})
    return df
def create_data_matrix(alphabet, w):
    """Create a feature matrix from all generated sequences.

    :param alphabet: a list containing the wanted alphabet
    :param w: the size of each generated sequence
    :return: a DataFrame with columns ['entropy', 'lzw', 'a', 'c', 'g', 't'],
        one row per generated sequence
    """
    dfs = []
    for seq, p in sequences_generator(alphabet, w):
        entropy = entropy_by_kmer(seq, 5)
        lzw = len(compress(seq))
        # bug fix: 'a' was hard-coded to [0] while c/g/t used p[1..3];
        # use p[0] so all four bases carry their proportion — confirm that
        # p is (a, c, g, t) ordered, matching the other feature builders
        df = pd.DataFrame({'entropy': entropy, 'lzw': lzw,
                           'a': p[0], 'c': p[1], 'g': p[2], 't': p[3]},
                          index=[0])
        dfs.append(df)
    result = pd.concat(dfs)
    return result
def _simulate_cluster(n, size, cluster_name, probs, stem_loop):
    """Simulate n sequences for one class and compute normalized entropies.

    :param n: number of sequences to simulate
    :param size: length of the randomly drawn part of each sequence
    :param cluster_name: label stored in the 'cluster' column
    :param probs: per-base probabilities for ('a', 'c', 'g', 't')
    :param stem_loop: if True, append the reverse complement to each
        sequence, producing a perfect stem loop
    :return: DataFrame with 'sequence', 'cluster', and 0-1 normalized
        'entropy' and 'joint_entropy' columns
    """
    sequences = []
    for _ in tqdm(range(n)):
        seq = ''.join(np.random.choice(['a', 'c', 'g', 't'], p=probs, size=size))
        if stem_loop:
            seq = seq + str(get_reverse_complement(seq))
        sequences.append(seq)
    df = pd.DataFrame({'sequence': sequences, 'cluster': cluster_name})
    df['entropy'] = df['sequence'].apply(lambda x: entropy_by_kmer(x, 5))
    df['joint_entropy'] = df['sequence'].apply(
        lambda x: joint_entropy(x, str(get_reverse_complement(x)), 5))
    # normalize both entropy and joint entropy to 0-1
    df['entropy'] = df['entropy'] / df['entropy'].max()
    df['joint_entropy'] = df['joint_entropy'] / df['joint_entropy'].max()
    return df


def simulate_dataset(n, size):
    """Simulate sequences from four classes: repetitive, repetitive with stem
    loop, stem loop only, and random.

    :param n: number of sequences to simulate per class
    :param size: the base size of each sequence
    :return: a DataFrame with sequence, cluster, and normalized entropy /
        joint-entropy columns for all four classes
    """
    # repetitive sequences (biased base composition)
    df_rep = _simulate_cluster(n, size, 'Repetitive',
                               [0.6, 0.2, 0.1, 0.1], stem_loop=False)
    # repetitive + structure: half-length draw plus its reverse complement
    df_rep_st = _simulate_cluster(n, size // 2, 'Repetitive + Stem loop',
                                  [0.6, 0.2, 0.1, 0.1], stem_loop=True)
    # structure only: uniform bases, perfect stem loop
    df_st = _simulate_cluster(n, size // 2, 'Stem loop',
                              [0.25, 0.25, 0.25, 0.25], stem_loop=True)
    # NOTE(review): the original 'Random' class also appended the reverse
    # complement (yielding a perfect stem loop of length 2*size, not a random
    # sequence). Behavior preserved here, but this looks like a copy-paste
    # slip — confirm whether stem_loop should be False for this class.
    df_rand = _simulate_cluster(n, size, 'Random',
                                [0.25, 0.25, 0.25, 0.25], stem_loop=True)
    # combine all inputs to one df, and return it
    result = pd.concat([df_rep, df_rep_st, df_st, df_rand])
    return result