# Example #1
def count_gc(file, sheet, index_column):
    """Compute the combined G/C percentage of the gene sequences in a sheet.

    Every entry of the sheet's 'Symbol' column is resolved to a sequence with
    ``useful.pull_fasta_sequence`` and passed through ``useful.clean_seq``;
    G and C nucleotides are then counted over all sequences together.

    Args:
        file: the excel file containing the list of genes to be analysed
        sheet: the sheet of the excel file that is to be read
        index_column: the column within the excel sheet to be used as the
            index for the data

    Returns:
        gc_percentage: the percentage of nucleotides that are either G or C,
        or 0.0 when the sheet yields no nucleotides at all.
    """
    gc_count = 0
    total_count = 0

    # pandas renamed the `sheetname` keyword to `sheet_name`; the old spelling
    # is rejected by current releases.
    df = pd.read_excel(file, sheet_name=sheet, index_col=index_column)

    for chromosome_id in df['Symbol']:
        # Counting works on the cleaned sequence directly, so no Biopython
        # Seq wrapper (and no removed Bio.Alphabet) is needed here.
        seq = useful.clean_seq(useful.pull_fasta_sequence(chromosome_id))
        gc_count += seq.count('G') + seq.count('C')
        # Every nucleotide contributes to the total exactly once, whether or
        # not it is a G or a C.
        total_count += len(seq)

    if total_count == 0:
        # Nothing to measure; avoid dividing by zero.
        return 0.0

    # 100.0 keeps the division floating point on every Python version.
    return 100.0 * gc_count / total_count
# Example #2
def interpret_rscu(file, sheet):
    """Compute the rscu values of all codons for all genes in the dataset.

    Each entry of the 'Symbol' column is fetched, cleaned, transcribed to RNA
    and handed to ``count_codon``; the per-gene counts are summed and the
    total is passed to ``get_rscu_value``.

    Args:
        file: the excel file containing the dataset
        sheet: the specific worksheet within the excel file to be accessed

    Returns:
        rscu_values: the result of ``get_rscu_value`` for the summed counts
        (a dictionary containing the rscu values for all codons associated
        to every amino acid).
    """
    count_dict = codon_dict.copy()

    # pandas renamed the `sheetname` keyword to `sheet_name`; the old spelling
    # is rejected by current releases.
    df = pd.read_excel(file, sheet_name=sheet, index_col=None)

    for chromosome_id in df['Symbol']:
        dna = useful.clean_seq(useful.pull_fasta_sequence(chromosome_id))
        # Bio.Alphabet no longer exists in current Biopython; a plain Seq
        # transcribes without an explicit alphabet argument.
        rna = Seq.Seq(dna).transcribe()
        # NOTE: Counter addition keeps only strictly positive totals, so
        # codons that were never seen are absent from the running total.
        count_dict = Counter(count_dict) + Counter(count_codon(rna))

    return get_rscu_value(count_dict)
# Example #3
def interpret(file, sheet, minimum_intensity=3.650557279586792):
    """Compute expression-weighted rscu values for all genes in the dataset.

    For every row, the fold change is the row's 'Average' value divided by
    ``minimum_intensity``. The codon counts of that gene are multiplied by
    the fold change before being added to the running total, and the total
    is finally passed to ``get_rscu_value``.

    Args:
        file: the excel file containing the dataset
        sheet: the specific worksheet within the excel file to be accessed
        minimum_intensity: baseline the 'Average' column is divided by. The
            default is the value that was previously hard-coded so that the
            lowest value is standardised across all datasets being compared.

    Returns:
        rscu_values: the result of ``get_rscu_value`` for the weighted counts
        (a dictionary containing the rscu values for all codons associated
        to every amino acid).
    """
    count_dict = codon_dict.copy()

    # pandas renamed the `sheetname` keyword to `sheet_name`; the old spelling
    # is rejected by current releases.
    df = pd.read_excel(file, sheet_name=sheet, index_col=None)

    # Pair the columns positionally instead of indexing by label, so the loop
    # does not depend on the frame having a 0..n-1 index.
    for gene_symbol, average in zip(df['Symbol'], df['Average']):
        print(gene_symbol)
        dna = useful.clean_seq(useful.pull_fasta_sequence(gene_symbol))
        # Bio.Alphabet no longer exists in current Biopython; a plain Seq
        # transcribes without an explicit alphabet argument.
        rna = Seq.Seq(dna).transcribe()
        fold_change = average / minimum_intensity

        # Build a fresh mapping rather than updating the count_codon result
        # while iterating over it.
        weighted_count = {
            codon: count * fold_change
            for codon, count in count_codon(rna).items()
        }
        # NOTE: Counter addition keeps only strictly positive totals.
        count_dict = Counter(count_dict) + Counter(weighted_count)

    return get_rscu_value(count_dict)
# Example #4
def interpret(file, sheet):
    """Compute fold-change-weighted rscu values for all genes in the dataset.

    The codon counts of each gene are multiplied by that row's 'FC' value
    before being added to the running total, which is then passed to
    ``get_rscu_value``.

    Args:
        file: the excel file containing the dataset
        sheet: the specific worksheet within the excel file to be accessed

    Returns:
        rscu_values: the result of ``get_rscu_value`` for the weighted counts
        (a dictionary containing the rscu values for all codons associated
        to every amino acid).
    """
    count_dict = codon_dict.copy()

    # pandas renamed the `sheetname` keyword to `sheet_name`; the old spelling
    # is rejected by current releases.
    df = pd.read_excel(file, sheet_name=sheet, index_col=None)
    fold_change = df['FC'].copy()

    # If any fold change is negative, flip the sign of the whole column.
    # NOTE(review): on a sheet mixing positive and negative values this turns
    # the positive ones negative -- confirm sheets are always single-signed.
    if fold_change.min() < 0:
        fold_change = fold_change * (-1.0)

    # Pair the columns positionally; this also avoids the name `id`, which
    # shadows the builtin.
    for gene_symbol, weight in zip(df['Symbol'], fold_change):
        dna = useful.clean_seq(useful.pull_fasta_sequence(gene_symbol))
        # Bio.Alphabet no longer exists in current Biopython; a plain Seq
        # transcribes without an explicit alphabet argument.
        rna = Seq.Seq(dna).transcribe()

        # Scale every codon count of this gene by its fold change.
        weighted_count = {
            codon: count * weight
            for codon, count in count_codon(rna).items()
        }
        # NOTE: Counter addition keeps only strictly positive totals.
        count_dict = Counter(count_dict) + Counter(weighted_count)

    return get_rscu_value(count_dict)
# Example #5
 def test_clean_seq(self):
     """Check useful.clean_seq on lower-case, contaminated and blank input."""
     # Lower-case nucleotides are expected back in upper case.
     lowercase_dna = 'aacggttaa'
     cleaned = useful.clean_seq(lowercase_dna)
     self.assertEqual(cleaned, 'AACGGTTAA', msg='Error: does not utilise .upper()')

     # Letters that are not nucleotides ('d' here) are expected to be dropped.
     contaminated_dna = 'aaggttddaatt'
     cleaned = useful.clean_seq(contaminated_dna)
     self.assertEqual(cleaned, 'AAGGTTAATT', msg='Error: does not remove non nucleotide letters')

     # A lone space is expected to clean down to the empty string.
     cleaned = useful.clean_seq(' ')
     self.assertEqual(cleaned, '', msg='Error: does not recognises spaces to skip')
def update_dict(gene_id):
    """Count how often each codon occurs in the sequence of one gene.

    The sequence is fetched with ``useful.pull_fasta_sequence``, cleaned,
    transcribed to RNA and then read in steps of three between the positions
    reported by ``useful.get_start`` and ``useful.get_stop``.

    Args:
        gene_id: The Gene_id of the target sequence

    Returns:
        a dictionary containing the codon composition of the gene: one key
        per tracked RNA codon (UAA, UAG and UGA are not among the keys),
        mapped to the number of times it was read.
    """
    # dict.fromkeys keeps the key order of the original literal.
    codon_dict = dict.fromkeys((
        'AUA', 'AUC', 'AUU', 'AUG', 'ACA', 'ACC', 'ACG', 'ACU',
        'AAC', 'AAU', 'AAA', 'AAG', 'AGC', 'AGU', 'AGA', 'AGG',
        'CUA', 'CUC', 'CUG', 'CUU', 'CCA', 'CCC', 'CCG', 'CCU',
        'CAC', 'CAU', 'CAA', 'CAG', 'CGA', 'CGC', 'CGG', 'CGU',
        'GUA', 'GUC', 'GUG', 'GUU', 'GCA', 'GCC', 'GCG', 'GCU',
        'GAC', 'GAU', 'GAA', 'GAG', 'GGA', 'GGC', 'GGG', 'GGU',
        'UCA', 'UCC', 'UCG', 'UCU', 'UUC', 'UUU', 'UUA', 'UUG',
        'UAC', 'UAU', 'UGC', 'UGU', 'UGG',
    ), 0)

    dna = useful.clean_seq(useful.pull_fasta_sequence(gene_id))
    # Bio.Alphabet no longer exists in current Biopython; a plain Seq
    # transcribes without an explicit alphabet argument.
    seq = Seq.Seq(dna).transcribe()
    start_pos = useful.get_start(seq)
    stop_pos = useful.get_stop(seq)

    # Reading begins three positions after start_pos and only full triplets
    # that end before stop_pos are considered.
    for j in range(start_pos + 3, stop_pos - 2, 3):
        # A direct lookup replaces scanning every key for each triplet.
        codon = str(seq[j:j + 3])
        if codon in codon_dict:
            codon_dict[codon] += 1

    return codon_dict
# Example #7
def update_dict(file, sheet):
    """Count codon occurrences over the strongest genes of a sheet.

    The sheet is reduced to at most 250 rows by 'FC': the largest values when
    the first row's 'FC' is positive, the smallest values otherwise. Codons
    are then counted over all remaining genes, and every count is divided by
    1000 and rounded to three decimals.

    Args:
        file: the file containing worksheets of the upregulated and
            downregulated genes
        sheet: the specific sheet within the excel file to be used for
            analysis

    Returns:
        codon_dict: a dictionary mapping each tracked RNA codon (UAA, UAG and
        UGA are not among the keys) to its count divided by 1000, rounded to
        three decimal places.
    """
    # dict.fromkeys keeps the key order of the original literal.
    codon_dict = dict.fromkeys((
        'AUA', 'AUC', 'AUU', 'AUG', 'ACA', 'ACC', 'ACG', 'ACU',
        'AAC', 'AAU', 'AAA', 'AAG', 'AGC', 'AGU', 'AGA', 'AGG',
        'CUA', 'CUC', 'CUG', 'CUU', 'CCA', 'CCC', 'CCG', 'CCU',
        'CAC', 'CAU', 'CAA', 'CAG', 'CGA', 'CGC', 'CGG', 'CGU',
        'GUA', 'GUC', 'GUG', 'GUU', 'GCA', 'GCC', 'GCG', 'GCU',
        'GAC', 'GAU', 'GAA', 'GAG', 'GGA', 'GGC', 'GGG', 'GGU',
        'UCA', 'UCC', 'UCG', 'UCU', 'UUC', 'UUU', 'UUA', 'UUG',
        'UAC', 'UAU', 'UGC', 'UGU', 'UGG',
    ), 0)

    # pandas renamed the `sheetname` keyword to `sheet_name`; the old spelling
    # is rejected by current releases.
    df = pd.read_excel(file, sheet_name=sheet, index_col=None)

    # An empty sheet has no first row to inspect, so skip the trimming.
    if not df.empty:
        # The sign of the first row decides which end of the 'FC' range is
        # kept; the two selections are alternatives, not consecutive steps.
        if df.iloc[0]['FC'] > 0:
            df = df.nlargest(n=250, columns=['FC'])
        else:
            df = df.nsmallest(n=250, columns=['FC'])

    for gene_id in df['Symbol']:
        dna = useful.clean_seq(useful.pull_fasta_sequence(gene_id))
        # Bio.Alphabet no longer exists in current Biopython; a plain Seq
        # transcribes without an explicit alphabet argument.
        seq = Seq.Seq(dna).transcribe()
        start_pos = useful.get_start(seq)
        stop_pos = useful.get_stop(seq)

        # Reading begins three positions after start_pos and only full
        # triplets that end before stop_pos are considered.
        for j in range(start_pos + 3, stop_pos - 2, 3):
            # A direct lookup replaces scanning every key for each triplet.
            codon = str(seq[j:j + 3])
            if codon in codon_dict:
                codon_dict[codon] += 1

    # Scale by 1000 and round in a single pass over a fresh mapping.
    return {
        codon: round(count / 1000.0, 3)
        for codon, count in codon_dict.items()
    }