Python clean_seqの例、useful.clean_seq Pythonの例

コード例 #1

0

ファイルを表示

def count_gc(file, sheet, index_column):
    '''
    computes the combined G and C content over the sequences of every gene listed in the dataset

    --------------------------------
    Input
        file: the excel file containing the list of genes to be analysed

        sheet: the sheet of the excel file that is to be read

        index_column: the column within the excel sheet to be used as the index for the data

    Returns
        gc_percentage: the percentage (float, 0-100) of nucleotides, summed over all
            sequences, that are either G or C; 0.0 when no nucleotides were read at all
    '''
    gc_count = 0
    total_count = 0

    # NOTE(review): newer pandas releases spell this keyword `sheet_name` -
    # confirm against the installed pandas version
    df = pd.read_excel(file, sheetname=sheet, index_col=index_column)

    # the 'Symbol' column holds the identifiers handed to useful.pull_fasta_sequence
    for chromosome_id in df['Symbol']:
        seq = useful.pull_fasta_sequence(chromosome_id)
        seq = useful.clean_seq(seq)
        seq = Seq.Seq(seq, Seq.Alphabet.generic_dna)

        # only upper-case 'G' / 'C' are counted, every character counts towards the total
        gc_count += sum(1 for nucleotide in seq if nucleotide in ('G', 'C'))
        total_count += len(seq)

    # an empty sheet (or only empty sequences) would otherwise divide by zero
    if total_count == 0:
        return 0.0

    # float() keeps the ratio from being truncated by integer division on Python 2
    gc_percentage = (float(gc_count) / total_count) * 100
    return gc_percentage

コード例 #2

0

ファイルを表示

def interpret_rscu(file, sheet):
    '''
    accumulates the codon counts of every gene in the dataset and converts the totals to rscu values

    -----------------------------
    Input:
        file: the excel file containing the dataset

        sheet: the specific worksheet within the excel file to be accessed

    Returns:
        rscu_values: whatever get_rscu_value produces for the accumulated codon counts
            (described by the project as a dictionary of rscu values per codon)
    '''
    # start from a copy so the module-level codon_dict is not rebound by this function
    running_total = codon_dict.copy()

    # NOTE(review): newer pandas releases spell this keyword `sheet_name` -
    # confirm against the installed pandas version
    dataset = pd.read_excel(file, sheetname=sheet, index_col=None)

    for gene in dataset['Symbol'].copy():
        dna = Seq.Seq(
            useful.clean_seq(useful.pull_fasta_sequence(gene)),
            Seq.Alphabet.generic_dna)
        per_gene = count_codon(dna.transcribe())

        # Counter addition keeps only the entries whose summed count is positive
        running_total = Counter(running_total) + Counter(per_gene)

    return get_rscu_value(running_total)

コード例 #3

0

ファイルを表示

ファイル: rscu_all_genes.py プロジェクト: 06fsantos/internship

def interpret(file, sheet, minimum_intensity=3.650557279586792):
    '''
    computes the rscu values of all codons for all genes in the dataset

    the fold change of a gene is its value in the 'Average' column divided by minimum_intensity

    the codon counts of each gene are multiplied by that fold change before being added to the
    running totals, so that the totals handed to get_rscu_value are weighted by expression

    -----------------------------
    Input:
        file: the excel file containing the dataset

        sheet: the specific worksheet within the excel file to be accessed

        minimum_intensity: the baseline used as the denominator of the fold change; the default is
            the fixed value previously hard-coded here, so that it stays identical across all
            datasets that are to be compared

    Returns:
        rscu_values: whatever get_rscu_value produces for the weighted codon counts
            (described by the project as a dictionary of rscu values per codon)
    '''
    count_dict = codon_dict.copy()

    # NOTE(review): newer pandas releases spell this keyword `sheet_name` -
    # confirm against the installed pandas version
    df = pd.read_excel(file, sheetname=sheet, index_col=None)

    # index_col=None gives a default 0..n-1 index, so walking both columns in step is
    # equivalent to looking the rows up by position
    for gene_symbol, average in zip(df['Symbol'], df['Average']):
        print(gene_symbol)
        seq = useful.pull_fasta_sequence(gene_symbol)
        seq = useful.clean_seq(seq)
        seq = Seq.Seq(seq, Seq.Alphabet.generic_dna)
        seq = seq.transcribe()
        gene_count = count_codon(seq)

        fold_change = average / minimum_intensity

        # build a new mapping instead of updating the dict while iterating over its items
        weighted_count = {
            codon: count * fold_change for codon, count in gene_count.items()}

        # Counter addition keeps only the entries whose summed count is positive
        count_dict = Counter(count_dict) + Counter(weighted_count)

    rscu_values = get_rscu_value(count_dict)
    return rscu_values

コード例 #4

0

ファイルを表示

ファイル: rscu_fc.py プロジェクト: 06fsantos/internship

def interpret(file, sheet):
    '''
    computes the rscu values of all codons for all genes in the dataset

    the codon counts of each gene are multiplied by the value of that gene in the 'FC'
    (fold change) column before being added to the running totals

    -----------------------------
    Input:
        file: the excel file containing the dataset

        sheet: the specific worksheet within the excel file to be accessed

    Returns:
        rscu_values: whatever get_rscu_value produces for the weighted codon counts
            (described by the project as a dictionary of rscu values per codon)
    '''
    count_dict = codon_dict.copy()

    # NOTE(review): newer pandas releases spell this keyword `sheet_name` -
    # confirm against the installed pandas version
    df = pd.read_excel(file, sheetname=sheet, index_col=None)
    fold_change = df['FC'].copy()

    # if ANY value is negative the sign of the WHOLE column is flipped
    # NOTE(review): on a sheet with mixed signs this turns the positive values negative -
    # presumably every sheet holds only up- or only down-regulated genes; confirm
    if fold_change.min() < 0:
        fold_change = fold_change * (-1.0)

    # index_col=None gives a default 0..n-1 index, so walking both columns in step is
    # equivalent to looking the rows up by position
    for gene_symbol, gene_fold_change in zip(df['Symbol'], fold_change):
        print(gene_symbol)
        seq = useful.pull_fasta_sequence(gene_symbol)
        seq = useful.clean_seq(seq)
        seq = Seq.Seq(seq, Seq.Alphabet.generic_dna)
        seq = seq.transcribe()
        gene_count = count_codon(seq)

        # build a new mapping instead of updating the dict while iterating over its items
        weighted_count = {
            codon: count * gene_fold_change for codon, count in gene_count.items()}

        # Counter addition keeps only the entries whose summed count is positive
        count_dict = Counter(count_dict) + Counter(weighted_count)

    rscu_values = get_rscu_value(count_dict)
    return rscu_values

コード例 #5

0

ファイルを表示

ファイル: useful_test.py プロジェクト: 06fsantos/internship

 def test_clean_seq(self):
     # each case: (raw input, expected result of useful.clean_seq, failure message)
     # covers upper-casing, removal of non nucleotide letters and skipping of spaces
     cases = (
         ('aacggttaa', 'AACGGTTAA',
          'Error: does not utilise .upper()'),
         ('aaggttddaatt', 'AAGGTTAATT',
          'Error: does not remove non nucleotide letters'),
         (' ', '',
          'Error: does not recognises spaces to skip'),
     )
     # checked in order; the first failing assertion aborts the test
     for raw, expected, failure_msg in cases:
         self.assertEqual(useful.clean_seq(raw), expected, msg=failure_msg)

コード例 #6

0

ファイルを表示

ファイル: sequence_composition.py プロジェクト: 06fsantos/internship

def update_dict(gene_id):
    '''
    counts the codons of the transcribed (RNA) sequence of a gene, reading in steps of three
    from the position after useful.get_start up to (not including) useful.get_stop

    --------------------------------
    Input:
        gene_id:
            The Gene_id of the target sequence

    Returns:
        codon_dict:
            a dictionary mapping each of the 61 non-stop RNA codons to the number of times
            it was read in the gene (0 for codons that never occur)
    '''
    # the 61 sense codons (UAA, UAG and UGA are deliberately absent), in the order
    # in which they appear in the returned dictionary
    codons = (
        'AUA', 'AUC', 'AUU', 'AUG',
        'ACA', 'ACC', 'ACG', 'ACU',
        'AAC', 'AAU', 'AAA', 'AAG',
        'AGC', 'AGU', 'AGA', 'AGG',
        'CUA', 'CUC', 'CUG', 'CUU',
        'CCA', 'CCC', 'CCG', 'CCU',
        'CAC', 'CAU', 'CAA', 'CAG',
        'CGA', 'CGC', 'CGG', 'CGU',
        'GUA', 'GUC', 'GUG', 'GUU',
        'GCA', 'GCC', 'GCG', 'GCU',
        'GAC', 'GAU', 'GAA', 'GAG',
        'GGA', 'GGC', 'GGG', 'GGU',
        'UCA', 'UCC', 'UCG', 'UCU',
        'UUC', 'UUU', 'UUA', 'UUG',
        'UAC', 'UAU', 'UGC', 'UGU',
        'UGG',
    )
    codon_dict = dict.fromkeys(codons, 0)

    seq = useful.pull_fasta_sequence(gene_id)
    seq = useful.clean_seq(seq)
    seq = Seq.Seq(seq, Seq.Alphabet.generic_dna)
    seq = seq.transcribe()

    # presumably the indices of the start and stop codon - confirm against useful
    start_pos = useful.get_start(seq)
    stop_pos = useful.get_stop(seq)

    # start_pos + 3 skips the codon at start_pos; stop_pos - 2 stops before the codon at stop_pos
    for j in range(start_pos + 3, stop_pos - 2, 3):
        # one dictionary lookup per codon instead of comparing against every key;
        # triplets that are not a key (e.g. truncated or stop codons) are ignored
        codon = str(seq[j:j + 3])
        if codon in codon_dict:
            codon_dict[codon] += 1

    return codon_dict

コード例 #7

0

ファイルを表示

def update_dict(file, sheet, top_n=250):
    '''
    counts how often every codon triplet appears in the genes with the most extreme fold change
    of a dataset and normalises the counts - the normalisation used here is to divide every
    raw count by 1000 and round the result to 3 decimal places

    ------------------------------
    Input:
        file: the file containing worksheets of the upregulated and downregulated genes,

        sheet: the specific sheet within the excel file to be used for analysis

        top_n: the number of genes to analyse (default 250); the genes with the largest 'FC'
            are taken when the first row has a positive 'FC', otherwise those with the smallest

    output:
        codon_dict: a dictionary mapping each of the 61 non-stop RNA codons to its
            count / 1000, rounded to 3 decimal places
    '''
    # the 61 sense codons (UAA, UAG and UGA are deliberately absent), in the order
    # in which they appear in the returned dictionary
    codons = (
        'AUA', 'AUC', 'AUU', 'AUG',
        'ACA', 'ACC', 'ACG', 'ACU',
        'AAC', 'AAU', 'AAA', 'AAG',
        'AGC', 'AGU', 'AGA', 'AGG',
        'CUA', 'CUC', 'CUG', 'CUU',
        'CCA', 'CCC', 'CCG', 'CCU',
        'CAC', 'CAU', 'CAA', 'CAG',
        'CGA', 'CGC', 'CGG', 'CGU',
        'GUA', 'GUC', 'GUG', 'GUU',
        'GCA', 'GCC', 'GCG', 'GCU',
        'GAC', 'GAU', 'GAA', 'GAG',
        'GGA', 'GGC', 'GGG', 'GGU',
        'UCA', 'UCC', 'UCG', 'UCU',
        'UUC', 'UUU', 'UUA', 'UUG',
        'UAC', 'UAU', 'UGC', 'UGU',
        'UGG',
    )
    codon_dict = dict.fromkeys(codons, 0)

    # NOTE(review): newer pandas releases spell this keyword `sheet_name` -
    # confirm against the installed pandas version
    df = pd.read_excel(file, sheetname=sheet, index_col=None)

    # the sign of the first row decides whether the sheet is treated as up- or down-regulated
    if df.iloc[0]['FC'] > 0:
        df = df.nlargest(n=top_n, columns=['FC'])
    else:
        df = df.nsmallest(n=top_n, columns=['FC'])

    for gene_id in df['Symbol']:
        seq = useful.pull_fasta_sequence(gene_id)
        seq = useful.clean_seq(seq)
        seq = Seq.Seq(seq, Seq.Alphabet.generic_dna)
        seq = seq.transcribe()

        # presumably the indices of the start and stop codon - confirm against useful
        start_pos = useful.get_start(seq)
        stop_pos = useful.get_stop(seq)

        # start_pos + 3 skips the codon at start_pos; stop_pos - 2 stops before the codon at stop_pos
        for j in range(start_pos + 3, stop_pos - 2, 3):
            # one dictionary lookup per codon instead of comparing against every key;
            # triplets that are not a key (e.g. truncated or stop codons) are ignored
            codon = str(seq[j:j + 3])
            if codon in codon_dict:
                codon_dict[codon] += 1

    # normalise and round in a single pass, without mutating the dict while iterating over it
    return {codon: round(count / 1000.0, 3) for codon, count in codon_dict.items()}