Esempio n. 1
0
def percentages(file, sheet, codon):
    '''
    computes the percentage frequency of a set of predetermined codons
    for only the 10 most up (or down) regulated genes in the dataset

    ------------------------------
    Input:
        file: the excel file name

        sheet: the worksheet of the excel file; it must provide a 'Symbol'
            column (gene IDs) and an 'FC' column (fold change)

        codon:
            the codon, or an iterable of codons, to be analysed

    Returns: the 10-row DataFrame selected by fold change, with one
             additional column per codon holding the value returned by
             useful.codon_percentage for each gene
    '''
    # A bare string is one codon; iterating it would yield single letters.
    codons = [codon] if isinstance(codon, str) else list(codon)

    # `sheet_name` is the current spelling; recent pandas releases reject
    # the older `sheetname` keyword.
    df = pd.read_excel(file, sheet_name=sheet, index_col=None)
    print(df)

    # The sign of the first row's fold change decides whether the sheet is
    # treated as up-regulated (take the largest FC) or down-regulated
    # (take the smallest FC).
    if df.iloc[0]['FC'] > 0:
        df = df.nlargest(n=10, columns=['FC'])
    else:
        df = df.nsmallest(n=10, columns=['FC'])

    # The sequence does not depend on the codon, so pull it once per gene
    # rather than once per (gene, codon) pair.
    sequences = [
        useful.pull_fasta_sequence(gene_id) for gene_id in df['Symbol']
    ]

    for current_codon in codons:
        df[current_codon] = [
            useful.codon_percentage(seq, current_codon) for seq in sequences
        ]

    print(df)
    return df
Esempio n. 2
0
def percentages(file, sheet, codons):
    '''
    computes the percentage frequency of a set of predetermined codons for each gene in a dataset

    ------------------------------
    Input:
        file: the excel file name

        sheet: the worksheet of the excel file; it must provide a 'Symbol'
            column containing the gene ID of each gene

        codons:
            the codon, or an iterable of codons, to be analysed

    Returns: the DataFrame read from the sheet with one additional column
             per codon holding the value returned by
             useful.codon_percentage for each gene
    '''
    # A bare string is one codon; iterating it would yield single letters.
    codon_list = [codons] if isinstance(codons, str) else list(codons)

    df = pd.read_excel(file, sheet_name=sheet, index_col=None)

    # The sequence does not depend on the codon, so pull it once per gene
    # rather than once per (gene, codon) pair.
    sequences = [
        useful.pull_fasta_sequence(gene_id) for gene_id in df['Symbol']
    ]

    for codon in codon_list:
        df[codon] = [
            useful.codon_percentage(seq, codon) for seq in sequences
        ]

    return df
Esempio n. 3
0
def count_gc(file, sheet, index_column):
    '''
    computes the combined percentage of G and C nucleotides over the DNA
    sequences of every gene listed in the sheet

    --------------------------------
    Input
        file: the excel file containing the list of genes to be analysed

        sheet: the sheet of the excel file that is to be read

        index_column: the column within the excel sheet to be used as the index for the data

    Returns
        gc_percentage: the percentage of nucleotides that are either G or C,
            pooled across all sequences; 0.0 when no nucleotides were read
    '''
    gc_count = 0
    total_count = 0

    # `sheet_name` is the current spelling; recent pandas releases reject
    # the older `sheetname` keyword.
    df = pd.read_excel(file, sheet_name=sheet, index_col=index_column)

    for chromosome_id in df['Symbol']:
        seq = useful.pull_fasta_sequence(chromosome_id)
        seq = useful.clean_seq(seq)
        # NOTE(review): presumably `Seq` is Bio.Seq; Bio.Alphabet no longer
        # exists in recent Biopython releases - confirm the pinned version.
        seq = Seq.Seq(seq, Seq.Alphabet.generic_dna)

        # Only upper-case 'G' and 'C' are counted, as before.
        gc_count += seq.count('G') + seq.count('C')
        total_count += len(seq)

    # An empty sheet (or only empty sequences) would otherwise divide by zero.
    if total_count == 0:
        return 0.0

    gc_percentage = (gc_count / total_count) * 100
    return gc_percentage
Esempio n. 4
0
def interpret_rscu(file, sheet):
    '''
    computes the rscu values of all codons for all genes in the dataset

    -----------------------------
    Input:
        file: the excel file containing the dataset

        sheet: the specific worksheet within the excel file to be accessed;
            gene IDs are read from its 'Symbol' column

    Returns:
        rscu_values: whatever get_rscu_value produces from the pooled codon
            counts (per the original notes, a dictionary containing the rscu
            values for all codons associated to every amino acid)
    '''
    # Start from a copy so the shared template dictionary is never mutated.
    count_dict = codon_dict.copy()

    # `sheet_name` is the current spelling; recent pandas releases reject
    # the older `sheetname` keyword.
    df = pd.read_excel(file, sheet_name=sheet, index_col=None)

    for chromosome_id in df['Symbol']:
        seq = useful.pull_fasta_sequence(chromosome_id)
        seq = useful.clean_seq(seq)
        seq = Seq.Seq(seq, Seq.Alphabet.generic_dna)
        seq = seq.transcribe()

        # Counter addition keeps only strictly positive totals, so codons
        # that have not been seen yet are absent from the running result.
        count_dict = Counter(count_dict) + Counter(count_codon(seq))

    return get_rscu_value(count_dict)
Esempio n. 5
0
def interpret(file, sheet, minimum_intensity=3.650557279586792):
    '''
    computes the rscu values of all codons for all genes in the dataset

    each gene's codon counts are multiplied by a fold change before the
    rscu values are calculated; the fold change is the gene's 'Average'
    value divided by minimum_intensity

    -----------------------------
    Input:
        file: the excel file containing the dataset

        sheet: the specific worksheet within the excel file to be accessed;
            it must provide 'Symbol' and 'Average' columns

        minimum_intensity: the baseline the 'Average' values are divided by.
            The default is a fixed value rather than df['Average'].min() so
            that the same baseline is used across all datasets being compared

    Returns:
        rscu_values: whatever get_rscu_value produces from the weighted
            codon counts (per the original notes, a dictionary containing
            the rscu values for all codons associated to every amino acid)
    '''
    # Start from a copy so the shared template dictionary is never mutated.
    count_dict = codon_dict.copy()

    # `sheet_name` is the current spelling; recent pandas releases reject
    # the older `sheetname` keyword.
    df = pd.read_excel(file, sheet_name=sheet, index_col=None)

    # Pair each gene with its own row's intensity instead of looking both
    # up by position through range(len(...)).
    for gene_id, average in zip(df['Symbol'], df['Average']):
        print(gene_id)
        seq = useful.pull_fasta_sequence(gene_id)
        seq = useful.clean_seq(seq)
        seq = Seq.Seq(seq, Seq.Alphabet.generic_dna)
        seq = seq.transcribe()

        fold_change = average / minimum_intensity

        # Build a new mapping rather than updating the counts while
        # iterating over them; this also behaves the same whether
        # count_codon returns a dict or a Counter.
        weighted_count = {
            codon: count * fold_change
            for codon, count in count_codon(seq).items()
        }

        # Counter addition keeps only strictly positive totals.
        count_dict = Counter(count_dict) + Counter(weighted_count)

    return get_rscu_value(count_dict)
Esempio n. 6
0
def interpret(file, sheet):
    '''
    computes the rscu values of all codons for all genes in the dataset

    each gene's codon counts are multiplied by that gene's fold change ('FC')
    before the rscu values are calculated

    -----------------------------
    Input:
        file: the excel file containing the dataset

        sheet: the specific worksheet within the excel file to be accessed;
            it must provide 'Symbol' and 'FC' columns

    Returns:
        rscu_values: whatever get_rscu_value produces from the weighted
            codon counts (per the original notes, a dictionary containing
            the rscu values for all codons associated to every amino acid)
    '''
    # Start from a copy so the shared template dictionary is never mutated.
    count_dict = codon_dict.copy()

    # `sheet_name` is the current spelling; recent pandas releases reject
    # the older `sheetname` keyword.
    df = pd.read_excel(file, sheet_name=sheet, index_col=None)
    fold_change = df['FC'].copy()

    # If any fold change is negative the whole column is negated so the
    # weights of a down-regulated sheet become positive.
    # NOTE(review): a sheet mixing positive and negative FC values is negated
    # wholesale, leaving some weights negative - confirm sheets are
    # single-signed.
    if fold_change.min() < 0:
        fold_change = fold_change * (-1.0)

    # Pair each gene with its own weight instead of indexing both by
    # position (the old loop variable also shadowed the builtin `id`).
    for gene_id, weight in zip(df['Symbol'], fold_change):
        print(gene_id)
        seq = useful.pull_fasta_sequence(gene_id)
        seq = useful.clean_seq(seq)
        seq = Seq.Seq(seq, Seq.Alphabet.generic_dna)
        seq = seq.transcribe()

        # Build a new mapping rather than updating the counts while
        # iterating over them; this also behaves the same whether
        # count_codon returns a dict or a Counter.
        weighted_count = {
            codon: count * weight
            for codon, count in count_codon(seq).items()
        }

        # Counter addition keeps only strictly positive totals.
        count_dict = Counter(count_dict) + Counter(weighted_count)

    return get_rscu_value(count_dict)
Esempio n. 7
0
    def test_pull_fasta_sequence(self):
        '''
        checks useful.pull_fasta_sequence against a hard-coded sequence

        the value returned for gene ID 'NM_029459' must equal the first
        literal below and must differ from the second literal, whose final
        line ends in ...AAAAA instead of ...AACTC

        each line of the literals ends with a backslash, which continues the
        string onto the next source line without inserting a newline; the
        continuation lines start at column 0 so that no indentation ends up
        inside the expected sequence
        '''
        gene_id = 'NM_029459'
        # positive case: the returned sequence matches the expected literal exactly
        self.assertEqual(useful.pull_fasta_sequence(gene_id), 'AGGGACCGTTGAGGGGCAGCTTCCACCAAAGACTATGGCACGCCCACCACCTCGAACTCCTCTCCAGAAA\
TGAACGACTAACACTGCTGAGGGAGTGGAAAAGTTAAAAAAAATAAGAAGAAAGGAAAAAAAAGAAAGAA\
AAGAAAAAAAACGAAAAGAAAATCTCCAAGTCCGCCCACTTCTTCATTACATCCTTTCACTCCCTCTTCC\
AAGATTTATATTCCTGAACCATTCGTGGTGGCAATTCCTGGGCTGTGAGAGGAATTTCGAGGTCTGCGTC\
AACTGGGATAACGAAAGTGGACATTATTTCAAATATTTCATTGAATTTGATCAGCGTTTTTCCATAGTCT\
CATCCAGAGAGATAGATCTTCACTGGATTCACAACTCAGACACATTTGAAGATTCTTGTAGAGCATCTGT\
GAGAGGAAGGAGGCTGCTGCAACCTAAGGCCTTTGTGGGTCTGGAACTCAGGAATCTCAGTTTCTGCAAT\
CTTGACCCTTACTGAAGTGCACCGGTTCCAGGAACCCATCAAATTTGAGTGATATTTGAAAGCCCTTTGC\
TACGGTGAGATTAATGAAGAGCTGTCCTGTATTCTTCAAGAAGCTGGTAATATTTATTTCAGTCAGCAAG\
TTATTCTAAACAAGAACAGTGTCTGAGTGGCAAGTTATTCTATCCAAGGACGGTGTCTGAGTGTGTACCG\
GCTAATAGTAAAGTTCCCTAAACTAGGTTTATGATGATGGATGAAAGAGACCCATCCTCGCTTTTGGATC\
TGGCTATACAGAGTCTACTAAGTAATGAGCTTGTAGCAATTCATTCTCTGGGGGAGATCCCAAGGGAGCT\
TTTTGTTCCATTGTTCTCTGCTGCCTTCACGGGAGGATATAGGAAGATACTGACTTCAATGGTGAAGATT\
TGGCCTTTTACCTGTCTCCACATTGGAACATTAAGTGTACAGGAACCCCAGCGTGAACTCCTGAAAGCCA\
TGGTTGAGAGTCTTCAGTTTCTTCCTGCCCAGGACTGTTCTTCTGGGGGCCCTAAGTTGAGGATCCTAGA\
TGTAAGGCAGGGTGTTGACTGCAAGACAACATGCCCTGATTTTGGTGCCAGATCTCCAACTTGTTTTCAT\
GGTTGTACTCACTCTGTACACTCTATTCTGAAGTTAGAAAGCCAGTACAGCATTGTAGATCTAAAGCCCG\
AGAGTCAGTCTGCAATCCAGCCTATGGAACTACTAGTAGACCTTTCCCTTGATGGTACCTTGAGAGAAAG\
GGAATTTTTTGCTTTGCTTCTGAATAAAGTACAGCAGAGCTCAGGGTCTTTGCACCTCTGCTGCCGAGAT\
CTACAAATTGATAGATTTTCTTATGCCAAAAACGCTCTGAAGTTCCTCGATCTAACTTGCATTCAGAACC\
TGACAGTTGATCAGGCTTCACTGAGTGAAGTCACCACTCTTCTGGCTCGCATGATCTATCTGGACAGCCT\
GAGTCTCTCTAAAATCACTTATAGATCTTTGCATGGGAAAGTCTTCCGAGTGTTCCTCAACTATCTTGGG\
CGGATGAACTGCCTGAAAGAGCTCAACCTGTCTTCCTTTAGCCTCACAGACCATCTGGATAGCCTCCTCA\
GAGCCTTACCACCTAATTTGGATTTCTTGTATCTGCCGTTCTGTGAAATTTCTTACAGAGATCTCAAATT\
TCTATCCCAGAGTGCTCAGGCCACCCACTTAAAGCTGTTGAATCTTAGTAACAACCCAATGTATTGGGAT\
GATTGTGGGCCTTTTCAGACTCTTTTGCAGAAGCTCTCAGATACCTTGCAGCATCTGGCCATAAACCATT\
GCCATTTAACAGATGCCATACTCTCTGCTATTCTGCCAGCACTATCTAAGTGTTCCCATCTCCGTGTGAT\
TAGCTTTGTCTCTAACCCCATTTCAATGCCTATGCTCCTGAAAATTCTTCATTACTTAACACCTTTGATG\
GAGCTGAAATACGTGATTTACCCTATCCCTATACATTGCTATGAACAATGGCAATTTCATGGCAGATTAG\
ACCGGCAGAAGCTCACCGATGTCCAAGCACAACTGAAGGCAATGCTACAAGCAGCAAAAAGGAGTGACAT\
GAACTGGATCACTTATTCTCAGTAAACTTCCAAGTTTAACTCCATCTCAAGCTCCAAATTTGACCTGTTA\
TCTGTTCAATGTTCTTTTCTCGAGCTTCAAGAATCTGATGTAAGAATTCGTACGTTATAGACGATTAAAG\
TTAGAAACTGATCAAAAACATTAACTC', msg = 'Error: output not equal to sequence')
        # negative case: a literal with a different ending must not compare equal
        self.assertNotEqual(useful.pull_fasta_sequence(gene_id), 'AGGGACCGTTGAGGGGCAGCTTCCACCAAAGACTATGGCACGCCCACCACCTCGAACTCCTCTCCAGAAA\
TGAACGACTAACACTGCTGAGGGAGTGGAAAAGTTAAAAAAAATAAGAAGAAAGGAAAAAAAAGAAAGAA\
AAGAAAAAAAACGAAAAGAAAATCTCCAAGTCCGCCCACTTCTTCATTACATCCTTTCACTCCCTCTTCC\
AAGATTTATATTCCTGAACCATTCGTGGTGGCAATTCCTGGGCTGTGAGAGGAATTTCGAGGTCTGCGTC\
AACTGGGATAACGAAAGTGGACATTATTTCAAATATTTCATTGAATTTGATCAGCGTTTTTCCATAGTCT\
CATCCAGAGAGATAGATCTTCACTGGATTCACAACTCAGACACATTTGAAGATTCTTGTAGAGCATCTGT\
GAGAGGAAGGAGGCTGCTGCAACCTAAGGCCTTTGTGGGTCTGGAACTCAGGAATCTCAGTTTCTGCAAT\
CTTGACCCTTACTGAAGTGCACCGGTTCCAGGAACCCATCAAATTTGAGTGATATTTGAAAGCCCTTTGC\
TACGGTGAGATTAATGAAGAGCTGTCCTGTATTCTTCAAGAAGCTGGTAATATTTATTTCAGTCAGCAAG\
TTATTCTAAACAAGAACAGTGTCTGAGTGGCAAGTTATTCTATCCAAGGACGGTGTCTGAGTGTGTACCG\
GCTAATAGTAAAGTTCCCTAAACTAGGTTTATGATGATGGATGAAAGAGACCCATCCTCGCTTTTGGATC\
TGGCTATACAGAGTCTACTAAGTAATGAGCTTGTAGCAATTCATTCTCTGGGGGAGATCCCAAGGGAGCT\
TTTTGTTCCATTGTTCTCTGCTGCCTTCACGGGAGGATATAGGAAGATACTGACTTCAATGGTGAAGATT\
TGGCCTTTTACCTGTCTCCACATTGGAACATTAAGTGTACAGGAACCCCAGCGTGAACTCCTGAAAGCCA\
TGGTTGAGAGTCTTCAGTTTCTTCCTGCCCAGGACTGTTCTTCTGGGGGCCCTAAGTTGAGGATCCTAGA\
TGTAAGGCAGGGTGTTGACTGCAAGACAACATGCCCTGATTTTGGTGCCAGATCTCCAACTTGTTTTCAT\
GGTTGTACTCACTCTGTACACTCTATTCTGAAGTTAGAAAGCCAGTACAGCATTGTAGATCTAAAGCCCG\
AGAGTCAGTCTGCAATCCAGCCTATGGAACTACTAGTAGACCTTTCCCTTGATGGTACCTTGAGAGAAAG\
GGAATTTTTTGCTTTGCTTCTGAATAAAGTACAGCAGAGCTCAGGGTCTTTGCACCTCTGCTGCCGAGAT\
CTACAAATTGATAGATTTTCTTATGCCAAAAACGCTCTGAAGTTCCTCGATCTAACTTGCATTCAGAACC\
TGACAGTTGATCAGGCTTCACTGAGTGAAGTCACCACTCTTCTGGCTCGCATGATCTATCTGGACAGCCT\
GAGTCTCTCTAAAATCACTTATAGATCTTTGCATGGGAAAGTCTTCCGAGTGTTCCTCAACTATCTTGGG\
CGGATGAACTGCCTGAAAGAGCTCAACCTGTCTTCCTTTAGCCTCACAGACCATCTGGATAGCCTCCTCA\
GAGCCTTACCACCTAATTTGGATTTCTTGTATCTGCCGTTCTGTGAAATTTCTTACAGAGATCTCAAATT\
TCTATCCCAGAGTGCTCAGGCCACCCACTTAAAGCTGTTGAATCTTAGTAACAACCCAATGTATTGGGAT\
GATTGTGGGCCTTTTCAGACTCTTTTGCAGAAGCTCTCAGATACCTTGCAGCATCTGGCCATAAACCATT\
GCCATTTAACAGATGCCATACTCTCTGCTATTCTGCCAGCACTATCTAAGTGTTCCCATCTCCGTGTGAT\
TAGCTTTGTCTCTAACCCCATTTCAATGCCTATGCTCCTGAAAATTCTTCATTACTTAACACCTTTGATG\
GAGCTGAAATACGTGATTTACCCTATCCCTATACATTGCTATGAACAATGGCAATTTCATGGCAGATTAG\
ACCGGCAGAAGCTCACCGATGTCCAAGCACAACTGAAGGCAATGCTACAAGCAGCAAAAAGGAGTGACAT\
GAACTGGATCACTTATTCTCAGTAAACTTCCAAGTTTAACTCCATCTCAAGCTCCAAATTTGACCTGTTA\
TCTGTTCAATGTTCTTTTCTCGAGCTTCAAGAATCTGATGTAAGAATTCGTACGTTATAGACGATTAAAG\
TTAGAAACTGATCAAAAACATTAAAAA', msg = 'Error: does not recognise different sequences')
Esempio n. 8
0
# Filters one worksheet down to the genes whose FASTA sequence can be pulled
# and writes the surviving rows to a new workbook.
file_out = 'cl_individual_sets_mock.xlsx'
file_name = 'Cell_lines_individual_datasets.xlsx'
sheet = 'Mock'

remove = []
# `sheet_name` is the current spelling; recent pandas releases reject the
# older `sheetname` keyword.
df = pd.read_excel(io=file_name, sheet_name=sheet)

# Keep the gene IDs separately before 'Symbol' becomes the index, so rows can
# later be dropped by label.
symbol = df['Symbol'].copy()
df = df.set_index('Symbol')

# For every gene in the dataset this attempts to pull the fasta file; if an
# error occurs accessing the file, the gene is added to the remove list and
# removed from the dataset.
#
# The genes are removed because not all of the gene IDs in the dataset will
# be endogenous to the species, as some are added during the preceding
# experiments - these will not have fasta sequences on the NCBI database.
for i in symbol:
    try:
        useful.pull_fasta_sequence(i)
    except Exception:
        # Deliberately best-effort: any failure to pull the sequence marks
        # the gene for removal. `Exception` (rather than a bare `except`)
        # still lets Ctrl-C and interpreter exit through.
        remove.append(i)

df = df.drop(remove)

# Leaving the `with` block saves and closes the workbook; explicit
# save()/close() calls are redundant and ExcelWriter.save() is not available
# on current pandas.
with pd.ExcelWriter(file_out, engine='xlsxwriter') as writer:
    df.to_excel(writer)
def update_dict(gene_id):
    '''
    isolates every codon in a DNA sequence and returns a dictionary containing the count of each codon

    the sequence is transcribed first, so the keys are RNA triplets; the
    three stop codons (UAA, UAG, UGA) are not tracked

    --------------------------------
    Input:
        gene_id:
            The Gene_id of the target sequence

    Returns:
        codon_dict:
            a dictionary mapping each of the 61 tracked codons to the number
            of times it occurs in the scanned region of the gene
    '''
    # Listed in the original key order so the returned dictionary iterates
    # exactly as before.
    tracked_codons = (
        'AUA', 'AUC', 'AUU', 'AUG',
        'ACA', 'ACC', 'ACG', 'ACU',
        'AAC', 'AAU', 'AAA', 'AAG',
        'AGC', 'AGU', 'AGA', 'AGG',
        'CUA', 'CUC', 'CUG', 'CUU',
        'CCA', 'CCC', 'CCG', 'CCU',
        'CAC', 'CAU', 'CAA', 'CAG',
        'CGA', 'CGC', 'CGG', 'CGU',
        'GUA', 'GUC', 'GUG', 'GUU',
        'GCA', 'GCC', 'GCG', 'GCU',
        'GAC', 'GAU', 'GAA', 'GAG',
        'GGA', 'GGC', 'GGG', 'GGU',
        'UCA', 'UCC', 'UCG', 'UCU',
        'UUC', 'UUU', 'UUA', 'UUG',
        'UAC', 'UAU',
        'UGC', 'UGU', 'UGG',
    )
    codon_dict = dict.fromkeys(tracked_codons, 0)

    seq = useful.pull_fasta_sequence(gene_id)
    seq = useful.clean_seq(seq)
    # NOTE(review): presumably `Seq` is Bio.Seq; Bio.Alphabet no longer exists
    # in recent Biopython releases - confirm the pinned version.
    seq = Seq.Seq(seq, Seq.Alphabet.generic_dna)
    seq = seq.transcribe()
    # presumably the offsets of the start and stop codons - verify in `useful`
    start_pos = useful.get_start(seq)
    stop_pos = useful.get_stop(seq)

    # Step through the reading frame three bases at a time, skipping the
    # triplet at start_pos and counting only triplets that end before
    # stop_pos.
    for j in range(start_pos + 3, stop_pos - 2, 3):
        # One dictionary lookup replaces the scan over all 61 keys.
        triplet = str(seq[j:j + 3])
        if triplet in codon_dict:
            codon_dict[triplet] += 1

    return codon_dict
Esempio n. 10
0
def update_dict(file, sheet):
    '''
    updates a dictionary of all codon triplets with the number of times that the triplet appears in a dataset
    and normalises the values - standard normalisation in this scenario is to divide by 1000

    only the 250 genes with the largest fold change (or the smallest, when
    the first row's fold change is not positive) are analysed; sequences are
    transcribed first, so the keys are RNA triplets, and the three stop
    codons (UAA, UAG, UGA) are not tracked

    ------------------------------
    Input:
        file: the file containing worksheets of the upregulated and downregulated genes,

        sheet: the specific sheet within the excel file to be used for analysis;
            it must provide 'Symbol' and 'FC' columns

    output:
        codon_dict: a dictionary mapping each tracked codon to its pooled
            count divided by 1000 and rounded to 3 decimal places
    '''
    # NOTE(review): the original notes describe the result as "frequency per
    # 1000 codons", but the raw counts are divided by a constant 1000 rather
    # than by the total number of codons - confirm which is intended.

    # Listed in the original key order so the returned dictionary iterates
    # exactly as before.
    tracked_codons = (
        'AUA', 'AUC', 'AUU', 'AUG',
        'ACA', 'ACC', 'ACG', 'ACU',
        'AAC', 'AAU', 'AAA', 'AAG',
        'AGC', 'AGU', 'AGA', 'AGG',
        'CUA', 'CUC', 'CUG', 'CUU',
        'CCA', 'CCC', 'CCG', 'CCU',
        'CAC', 'CAU', 'CAA', 'CAG',
        'CGA', 'CGC', 'CGG', 'CGU',
        'GUA', 'GUC', 'GUG', 'GUU',
        'GCA', 'GCC', 'GCG', 'GCU',
        'GAC', 'GAU', 'GAA', 'GAG',
        'GGA', 'GGC', 'GGG', 'GGU',
        'UCA', 'UCC', 'UCG', 'UCU',
        'UUC', 'UUU', 'UUA', 'UUG',
        'UAC', 'UAU',
        'UGC', 'UGU', 'UGG',
    )
    codon_dict = dict.fromkeys(tracked_codons, 0)

    # `sheet_name` is the current spelling; recent pandas releases reject
    # the older `sheetname` keyword.
    df = pd.read_excel(file, sheet_name=sheet, index_col=None)

    # The sign of the first row's fold change decides whether the sheet is
    # treated as up-regulated (largest FC) or down-regulated (smallest FC).
    if df.iloc[0]['FC'] > 0:
        df = df.nlargest(n=250, columns=['FC'])
    else:
        df = df.nsmallest(n=250, columns=['FC'])

    for gene_id in df['Symbol']:
        seq = useful.pull_fasta_sequence(gene_id)
        seq = useful.clean_seq(seq)
        seq = Seq.Seq(seq, Seq.Alphabet.generic_dna)
        seq = seq.transcribe()
        # presumably the offsets of the start and stop codons - verify in `useful`
        start_pos = useful.get_start(seq)
        stop_pos = useful.get_stop(seq)

        # Step through the reading frame three bases at a time, skipping the
        # triplet at start_pos and counting only triplets that end before
        # stop_pos.
        for j in range(start_pos + 3, stop_pos - 2, 3):
            # One dictionary lookup replaces the scan over all 61 keys.
            triplet = str(seq[j:j + 3])
            if triplet in codon_dict:
                codon_dict[triplet] += 1

    # Scale and round in a single pass, preserving key order.
    return {
        codon: round(count / 1000.0, 3)
        for codon, count in codon_dict.items()
    }