Code example #1
0
def get_entropy_profile_per_sequence(seq, w, alias, out=None):
    """
    Compute a sliding-window entropy profile for a single sequence.

    :param seq: the genomic sequence (string)
    :param w: the window size
    :param alias: name used for the result column and the output file name
    :param out: optional output directory; if not None the profile is saved
        as '<alias>_profile.csv' in that directory
    :return: a single-column DataFrame with the entropy profile
    """
    # 5-mer entropy of every window of length w, one value per start position
    entropies = [entropy_by_kmer(seq[j:j + w], 5) for j in range(len(seq) - w)]

    df = pd.DataFrame({'{}'.format(alias): entropies})
    if out is not None:
        df.to_csv(os.path.join(out, '{}_profile.csv'.format(alias)),
                  index=False)

    return df
Code example #2
0
def get_entropy_profile(fasta, w, out=None, type='fasta'):
    """
    Sliding-window entropy profile of all sequences in a file.

    :param fasta: a fasta file containing viral sequences
    :param w: the window size
    :param out: optional output directory; if not None the profile is saved
        as '<file basename>_profile.csv' in that directory
    :param type: SeqIO parser format (shadows the builtin 'type'; the name
        is kept for backward compatibility with existing callers)
    :return: DataFrame with one column of window entropies per sequence
    """
    all_entropies = {}
    alias = os.path.basename(fasta).split('.')[0]

    for i, rec in enumerate(SeqIO.parse(fasta, type)):
        genome = str(rec.seq)
        # 5-mer entropy of every window of length w
        entropies = [entropy_by_kmer(genome[j:j + w], 5)
                     for j in range(len(genome) - w)]
        print('Done with seq {}'.format(i))
        all_entropies['seq_{}'.format(i)] = entropies

    # wrap each list in a Series so unequal-length profiles are NaN-padded
    df = pd.DataFrame({k: pd.Series(v) for k, v in all_entropies.items()})

    # BUG FIX: previously called os.path.join(out, ...) unconditionally,
    # crashing with TypeError whenever the default out=None was used
    if out is not None:
        df.to_csv(os.path.join(out, '{}_profile.csv'.format(alias)),
                  index=False)

    return df
Code example #3
0
File: stats_utils.py  Project: taliaku/SternLab
def test_real_data_per_seq(seq, data, w, index):
    """
    Predict a label for every window of a sequence with a random forest
    trained on reference data.

    :param seq: the genomic sequence (string; lower-cased internally)
    :param data: DataFrame used to train the classifier; must contain the
        feature columns 'entropy', 'lzw', 'a', 'c', 'g', 't' and labels 'y'
    :param w: the size of the sliding window in the genome profile
    :param index: an index for the sequence, used in the output column name
    :return: DataFrame with a single column 'predicted_<index>' holding the
        predicted label of each window
    """
    seq = seq.lower()

    # fit a random forest model on the reference data
    X = data[['entropy', 'lzw', 'a', 'c', 'g', 't']]  # Features
    y = data['y']  # Labels
    clf = RandomForestClassifier(n_estimators=100)
    clf.fit(X, y)

    # build the feature row for every window of length w
    dfs = []
    for j in tqdm(range(len(seq) - w)):
        sub_genome = seq[j:j + w]
        n = len(sub_genome)
        dfs.append(pd.DataFrame({
            'entropy': entropy_by_kmer(sub_genome, 5),
            'lzw': len(compress(sub_genome)),
            'a': sub_genome.count('a') / n,
            'c': sub_genome.count('c') / n,
            'g': sub_genome.count('g') / n,
            't': sub_genome.count('t') / n,
        }, index=[j]))

    # robustness: pd.concat([]) raises ValueError when the sequence is
    # shorter than the window, so return an empty result instead
    if not dfs:
        return pd.DataFrame({'predicted_{}'.format(index): []})

    result = pd.concat(dfs)

    # predict labels for all windows at once
    y_pred = clf.predict(result)
    return pd.DataFrame({'predicted_{}'.format(index): y_pred})
Code example #4
0
def genome_2_entropy(fasta, k):
    """
    Map each complete genome in a fasta file to its k-mer entropy.

    :param fasta: a fasta file containing the genome records
    :param k: the k-mer size
    :return: a DataFrame with columns 'refseq_id' and 'entropy_<k>'
    """
    k_entropy = []
    ref_seq_id = []

    # BUG FIX: use a context manager so the file handle is closed
    # deterministically (it was previously never closed)
    with open(fasta, "r") as handle:
        sequences = re.split(">", handle.read().replace('\n', ''))[1:]

    for seq in tqdm(sequences):
        # skip records whose flattened header lacks the expected
        # '<id>.<...> complete genome' structure
        if '.' not in seq:
            print('no dot in sequence name\n')
            continue
        if 'complete genome' not in seq:
            print('not complete genome\n')
            continue

        # get identifier and genomic sequence
        splitted = seq.split('.')
        identifier = remove_punctuation(splitted[0].split()[0])
        genome = splitted[-1]

        # calculate entropy for the full genome
        k_entropy.append(entropy_by_kmer(genome, k))
        ref_seq_id.append(identifier)

    df = pd.DataFrame({'refseq_id': ref_seq_id,
                       'entropy_{}'.format(k): k_entropy})
    return df
Code example #5
0
def create_data_matrix(alphabet, w):
    """
    Create a feature matrix for all generated sequences.

    :param alphabet: a list containing the wanted alphabet
    :param w: the size of each generated sequence
    :return: a DataFrame with entropy, lzw-compression length and
        per-letter frequency columns for every generated sequence
    """
    dfs = []
    for seq, p in sequences_generator(alphabet, w):
        row = pd.DataFrame(
            {
                'entropy': entropy_by_kmer(seq, 5),
                'lzw': len(compress(seq)),
                # BUG FIX: 'a' was hard-coded to [0] while c/g/t used
                # p[1..3]; use p[0] so all four letters come from p
                'a': p[0],
                'c': p[1],
                'g': p[2],
                't': p[3],
            },
            index=[0])
        dfs.append(row)

    return pd.concat(dfs)
Code example #6
0
File: stats_utils.py  Project: taliaku/SternLab
def _simulated_cluster(n, seq_len, probs, append_rc, cluster_name):
    """
    Simulate one cluster of n random sequences and score them.

    :param n: number of sequences to simulate
    :param seq_len: length of the randomly drawn part of each sequence
    :param probs: base probabilities for ['a', 'c', 'g', 't']
    :param append_rc: if True, append the reverse complement of the drawn
        part, producing a perfect stem loop (doubling the length)
    :param cluster_name: label stored in the 'cluster' column
    :return: DataFrame with sequence, cluster, and entropy / joint entropy
        normalized to 0-1 within the cluster
    """
    sequences = []
    for _ in tqdm(range(n)):
        seq = ''.join(np.random.choice(['a', 'c', 'g', 't'], p=probs,
                                       size=seq_len))
        if append_rc:
            seq = seq + str(get_reverse_complement(seq))
        sequences.append(seq)

    df = pd.DataFrame({'sequence': sequences, 'cluster': cluster_name})
    df['entropy'] = df['sequence'].apply(lambda x: entropy_by_kmer(x, 5))
    df['joint_entropy'] = df['sequence'].apply(
        lambda x: joint_entropy(x, str(get_reverse_complement(x)), 5))

    # normalize both entropy and joint entropy to 0-1 within the cluster
    df['entropy'] = df['entropy'] / df['entropy'].max()
    df['joint_entropy'] = df['joint_entropy'] / df['joint_entropy'].max()
    return df


def simulate_dataset(n, size):
    """
    Simulate sequences from four classes: repetitive, repetitive with stem
    loop, stem loop only, and random.

    :param n: number of sequences to simulate per class
    :param size: the base size of each sequence
    :return: a DataFrame containing sequence, cluster, and entropy / joint
        entropy (each normalized to 0-1 within its cluster)
    """
    # repetitive sequences (skewed base composition, no stem loop)
    df_rep = _simulated_cluster(n, size, [0.6, 0.2, 0.1, 0.1],
                                False, 'Repetitive')

    # repetitive sequences + structure - a perfect stem loop
    df_rep_st = _simulated_cluster(n, size // 2, [0.6, 0.2, 0.1, 0.1],
                                   True, 'Repetitive + Stem loop')

    # only structure - a perfect stem loop of uniform bases
    df_st = _simulated_cluster(n, size // 2, [0.25, 0.25, 0.25, 0.25],
                               True, 'Stem loop')

    # NOTE(review): the original 'Random' class also appended the reverse
    # complement (making it a stem loop of length 2*size); behavior is
    # preserved here — confirm this is intended
    df_rand = _simulated_cluster(n, size, [0.25, 0.25, 0.25, 0.25],
                                 True, 'Random')

    # combine all clusters into one df, and return it
    return pd.concat([df_rep, df_rep_st, df_st, df_rand])