def count_gc(file, sheet, index_column):
    '''
    computes the combined G + C content of every sequence listed in a sheet
    --------------------------------
    Input
    file: the excel file containing the list of genes to be analysed
    sheet: the sheet of the excel file that is to be read
    index_column: the column within the excel sheet to be used as the
                  index for the data

    Returns
    gc_percentage: the percentage of nucleotides, pooled over all the
                   sequences, that are either G or C; 0.0 when no
                   nucleotides were read at all
    '''
    gc_count = 0
    total_count = 0

    # NOTE(review): `sheetname` is the pre-0.21 pandas spelling; newer
    # releases expect `sheet_name` -- confirm the pinned pandas version.
    df = pd.read_excel(file, sheetname=sheet, index_col=index_column)

    for chromosome_id in df['Symbol']:
        seq = useful.pull_fasta_sequence(chromosome_id)
        seq = useful.clean_seq(seq)
        seq = str(Seq.Seq(seq, Seq.Alphabet.generic_dna))
        # Only upper-case 'G' and 'C' are counted, as in the original
        # character-by-character comparison.
        gc_count += seq.count('G') + seq.count('C')
        total_count += len(seq)

    # An empty sheet (or only empty sequences) would otherwise divide by zero.
    if total_count == 0:
        return 0.0

    # float() keeps this a true division even if the module is run under
    # Python 2, where int / int would truncate to 0.
    gc_percentage = (float(gc_count) / total_count) * 100
    return gc_percentage
def interpret_rscu(file, sheet):
    '''
    pools the codon counts of every gene in the dataset and converts the
    pooled counts to rscu values
    -----------------------------
    Input:
    file: the excel file containing the dataset
    sheet: the specific worksheet within the excel file to be accessed

    Returns:
    rscu_values: the result of get_rscu_value applied to the pooled codon
                 counts (documented by the original author as a dictionary
                 of rscu values for all codons of every amino acid)
    '''
    pooled_counts = codon_dict.copy()
    frame = pd.read_excel(file, sheetname=sheet, index_col=None)
    gene_symbols = frame['Symbol'].copy()

    for gene_symbol in gene_symbols:
        cleaned = useful.clean_seq(useful.pull_fasta_sequence(gene_symbol))
        # DNA -> RNA, so the triplets handed to count_codon contain U, not T.
        rna = Seq.Seq(cleaned, Seq.Alphabet.generic_dna).transcribe()
        # Counter addition keeps only entries whose summed count is positive.
        pooled_counts = Counter(pooled_counts) + Counter(count_codon(rna))

    return get_rscu_value(pooled_counts)
def interpret(file, sheet, minimum_intensity=3.650557279586792):
    '''
    computes expression-weighted rscu values for all genes in the dataset

    each gene's codon counts are multiplied by a fold change, taken as the
    gene's value in the 'Average' column divided by minimum_intensity,
    before being pooled and passed to get_rscu_value
    -----------------------------
    Input:
    file: the excel file containing the dataset
    sheet: the specific worksheet within the excel file to be accessed
    minimum_intensity: the baseline intensity used as the fold-change
                       denominator; the default is the fixed value the
                       original code used so that all datasets being
                       compared share one baseline

    Returns:
    rscu_values: the result of get_rscu_value applied to the weighted,
                 pooled codon counts (documented by the original author as
                 a dictionary of rscu values for all codons of every
                 amino acid)
    '''
    count_dict = codon_dict.copy()

    # NOTE(review): `sheetname` is the pre-0.21 pandas spelling; newer
    # releases expect `sheet_name` -- confirm the pinned pandas version.
    df = pd.read_excel(file, sheetname=sheet, index_col=None)

    # Walk the two columns in lockstep instead of indexing by position.
    for gene_symbol, average in zip(df['Symbol'], df['Average']):
        print(gene_symbol)
        seq = useful.pull_fasta_sequence(gene_symbol)
        seq = useful.clean_seq(seq)
        seq = Seq.Seq(seq, Seq.Alphabet.generic_dna)
        seq = seq.transcribe()

        fold_change = average / minimum_intensity
        # Build the weighted counts as a new dict rather than updating the
        # dict that is being iterated.
        weighted_count = {
            codon: count * fold_change
            for codon, count in count_codon(seq).items()
        }
        # Counter addition keeps only entries whose summed count is positive.
        count_dict = Counter(count_dict) + Counter(weighted_count)

    rscu_values = get_rscu_value(count_dict)
    return rscu_values
def interpret(file, sheet):
    '''
    computes fold-change-weighted rscu values for all genes in the dataset

    each gene's codon counts are multiplied by the magnitude of its value
    in the 'FC' column before being pooled and passed to get_rscu_value
    -----------------------------
    Input:
    file: the excel file containing the dataset
    sheet: the specific worksheet within the excel file to be accessed

    Returns:
    rscu_values: the result of get_rscu_value applied to the weighted,
                 pooled codon counts (documented by the original author as
                 a dictionary of rscu values for all codons of every
                 amino acid)
    '''
    count_dict = codon_dict.copy()

    # NOTE(review): `sheetname` is the pre-0.21 pandas spelling; newer
    # releases expect `sheet_name` -- confirm the pinned pandas version.
    df = pd.read_excel(file, sheetname=sheet, index_col=None)

    # Use the magnitude of every fold change as its weight. Negating the
    # whole column whenever its minimum is negative would turn the positive
    # entries of a mixed-sign sheet into negative weights.
    fold_changes = df['FC'].abs()

    # Walk the two columns in lockstep instead of indexing by position.
    for gene_symbol, fold_change in zip(df['Symbol'], fold_changes):
        print(gene_symbol)
        seq = useful.pull_fasta_sequence(gene_symbol)
        seq = useful.clean_seq(seq)
        seq = Seq.Seq(seq, Seq.Alphabet.generic_dna)
        seq = seq.transcribe()

        # Build the weighted counts as a new dict rather than updating the
        # dict that is being iterated.
        weighted_count = {
            codon: count * fold_change
            for codon, count in count_codon(seq).items()
        }
        # Counter addition keeps only entries whose summed count is positive.
        count_dict = Counter(count_dict) + Counter(weighted_count)

    rscu_values = get_rscu_value(count_dict)
    return rscu_values
def test_clean_seq(self):
    # Each row: (raw input, expected cleaned output, failure message).
    # The rows are checked in order and the first mismatch fails the test.
    cases = (
        ('aacggttaa', 'AACGGTTAA',
         'Error: does not utilise .upper()'),
        ('aaggttddaatt', 'AAGGTTAATT',
         'Error: does not remove non nucleotide letters'),
        (' ', '',
         'Error: does not recognises spaces to skip'),
    )
    for raw, expected, failure_note in cases:
        self.assertEqual(useful.clean_seq(raw), expected, msg=failure_note)
def update_dict(gene_id):
    '''
    isolates every codon between the start and stop positions of a gene's
    transcribed sequence and returns the count of each codon
    --------------------------------
    Input:
    gene_id: The Gene_id of the target sequence

    Returns:
    codon_dict: a dictionary mapping each of the 61 codons in the table
                below to the number of times it was seen; triplets that are
                not in the table (the table has no UAA, UAG or UGA entry)
                are ignored
    '''
    codon_dict = {
        'AUA': 0, 'AUC': 0, 'AUU': 0, 'AUG': 0,
        'ACA': 0, 'ACC': 0, 'ACG': 0, 'ACU': 0,
        'AAC': 0, 'AAU': 0, 'AAA': 0, 'AAG': 0,
        'AGC': 0, 'AGU': 0, 'AGA': 0, 'AGG': 0,
        'CUA': 0, 'CUC': 0, 'CUG': 0, 'CUU': 0,
        'CCA': 0, 'CCC': 0, 'CCG': 0, 'CCU': 0,
        'CAC': 0, 'CAU': 0, 'CAA': 0, 'CAG': 0,
        'CGA': 0, 'CGC': 0, 'CGG': 0, 'CGU': 0,
        'GUA': 0, 'GUC': 0, 'GUG': 0, 'GUU': 0,
        'GCA': 0, 'GCC': 0, 'GCG': 0, 'GCU': 0,
        'GAC': 0, 'GAU': 0, 'GAA': 0, 'GAG': 0,
        'GGA': 0, 'GGC': 0, 'GGG': 0, 'GGU': 0,
        'UCA': 0, 'UCC': 0, 'UCG': 0, 'UCU': 0,
        'UUC': 0, 'UUU': 0, 'UUA': 0, 'UUG': 0,
        'UAC': 0, 'UAU': 0, 'UGC': 0, 'UGU': 0,
        'UGG': 0,
    }

    seq = useful.pull_fasta_sequence(gene_id)
    seq = useful.clean_seq(seq)
    seq = Seq.Seq(seq, Seq.Alphabet.generic_dna)
    seq = seq.transcribe()
    start_pos = useful.get_start(seq)
    stop_pos = useful.get_stop(seq)

    # Plain string so each triplet can be used directly as a dictionary key.
    rna = str(seq)

    # Counting begins three positions after start_pos (presumably to skip
    # the start codon itself -- confirm against useful.get_start) and
    # advances one codon at a time.
    for j in range(start_pos + 3, stop_pos - 2, 3):
        codon = rna[j:j + 3]
        # One membership test replaces a scan over every key of the table.
        if codon in codon_dict:
            codon_dict[codon] += 1

    return codon_dict
def update_dict(file, sheet):
    '''
    counts how often each codon appears across the 250 most extreme genes
    of a worksheet and scales the counts by 1/1000
    ------------------------------
    Input:
    file: the file containing worksheets of the upregulated and
          downregulated genes
    sheet: the specific sheet within the excel file to be used for analysis

    Returns:
    codon_dict: a dictionary mapping each of the 61 codons in the table
                below to its pooled count divided by 1000 and rounded to
                three decimal places; triplets that are not in the table
                (the table has no UAA, UAG or UGA entry) are ignored

    NOTE(review): the counts are divided by the constant 1000, not by the
    total number of codons counted, so the result is not a per-thousand
    frequency in the usual sense -- confirm this is the intended
    normalisation.
    '''
    codon_dict = {
        'AUA': 0, 'AUC': 0, 'AUU': 0, 'AUG': 0,
        'ACA': 0, 'ACC': 0, 'ACG': 0, 'ACU': 0,
        'AAC': 0, 'AAU': 0, 'AAA': 0, 'AAG': 0,
        'AGC': 0, 'AGU': 0, 'AGA': 0, 'AGG': 0,
        'CUA': 0, 'CUC': 0, 'CUG': 0, 'CUU': 0,
        'CCA': 0, 'CCC': 0, 'CCG': 0, 'CCU': 0,
        'CAC': 0, 'CAU': 0, 'CAA': 0, 'CAG': 0,
        'CGA': 0, 'CGC': 0, 'CGG': 0, 'CGU': 0,
        'GUA': 0, 'GUC': 0, 'GUG': 0, 'GUU': 0,
        'GCA': 0, 'GCC': 0, 'GCG': 0, 'GCU': 0,
        'GAC': 0, 'GAU': 0, 'GAA': 0, 'GAG': 0,
        'GGA': 0, 'GGC': 0, 'GGG': 0, 'GGU': 0,
        'UCA': 0, 'UCC': 0, 'UCG': 0, 'UCU': 0,
        'UUC': 0, 'UUU': 0, 'UUA': 0, 'UUG': 0,
        'UAC': 0, 'UAU': 0, 'UGC': 0, 'UGU': 0,
        'UGG': 0,
    }

    # NOTE(review): `sheetname` is the pre-0.21 pandas spelling; newer
    # releases expect `sheet_name` -- confirm the pinned pandas version.
    df = pd.read_excel(file, sheetname=sheet, index_col=None)

    # The sign of the first row's FC decides which end of the ranking is
    # kept: the 250 largest values if it is positive, otherwise the 250
    # smallest.
    if df.iloc[0]['FC'] > 0:
        df = df.nlargest(n=250, columns=['FC'])
    else:
        df = df.nsmallest(n=250, columns=['FC'])

    for gene_id in df['Symbol']:
        seq = useful.pull_fasta_sequence(gene_id)
        seq = useful.clean_seq(seq)
        seq = Seq.Seq(seq, Seq.Alphabet.generic_dna)
        seq = seq.transcribe()
        start_pos = useful.get_start(seq)
        stop_pos = useful.get_stop(seq)

        # Plain string so each triplet can be used directly as a key.
        rna = str(seq)

        # Counting begins three positions after start_pos (presumably to
        # skip the start codon itself -- confirm against useful.get_start).
        for j in range(start_pos + 3, stop_pos - 2, 3):
            codon = rna[j:j + 3]
            # One membership test replaces a scan over every key.
            if codon in codon_dict:
                codon_dict[codon] += 1

    # Scale and round in one pass; key order matches the table above.
    return {
        codon: round(count / 1000.0, 3)
        for codon, count in codon_dict.items()
    }