class GeneLocus(core.PolymerLocus):
    """ Knowledge of a gene

    Attributes:
        symbol (:obj:`str`): symbol
        start (:obj:`int`): start position
        end (:obj:`int`): end position
        is_essential (:obj:`bool`): essentiality flag
        proteins (:obj:`ProteinSpeciesType`): protein (one-to-one; the protein
            refers back to this locus through its ``gene`` attribute)
        homologs (:obj:`str`): homologs
        evidence (:obj:`list` of :obj:`core.Evidence`): evidence (each evidence
            refers back to this locus through its ``genes`` attribute)
        cog (ontology term, optional): term drawn from the subclasses of
            ``WC:COG`` in ``kbOnt``; may be :obj:`None`
    """
    symbol = obj_tables.StringAttribute()
    start = obj_tables.IntegerAttribute()
    end = obj_tables.IntegerAttribute()
    is_essential = obj_tables.BooleanAttribute()
    # `ProteinSpeciesType` is declared further down in this file, so it is
    # referenced by name (resolved lazily by obj_tables) rather than as a bare
    # identifier, which would raise a NameError while this class body executes.
    proteins = obj_tables.OneToOneAttribute('ProteinSpeciesType', related_name='gene')
    homologs = obj_tables.LongStringAttribute()
    evidence = obj_tables.OneToManyAttribute(core.Evidence, related_name='genes')
    cog = obj_tables.sci.onto.OntoTermAttribute(
        kbOnt,
        terms=kbOnt['WC:COG'].subclasses(),
        none=True)

    class Meta(obj_tables.Model.Meta):
        verbose_name = 'Gene'
        verbose_name_plural = 'Genes'
        attribute_order = (
            'id', 'name', 'synonyms', 'symbol', 'polymer', 'start', 'end',
            'cog', 'homologs', 'is_essential', 'proteins', 'evidence',
            'identifiers', 'references', 'comments')
class Company(obj_tables.Model):
    """ A company

    Attributes:
        name (:obj:`str`): name; primary, unique key
        url (:obj:`str`): URL
        address (:obj:`Address`): address (one-to-one; the address refers back
            through its ``company`` attribute)
    """
    name = obj_tables.StringAttribute(
        primary=True,
        unique=True,
        verbose_name='Name')
    url = obj_tables.UrlAttribute(verbose_name='URL')
    address = obj_tables.OneToOneAttribute(
        'Address',
        related_name='company',
        verbose_name='Address')

    class Meta(obj_tables.Model.Meta):
        verbose_name = 'Company'
        verbose_name_plural = 'Companies'
        # Laid out in the column table format, unlike the row-format models in this file.
        table_format = obj_tables.TableFormat.column
        attribute_order = ('name', 'url', 'address',)
class Gene(obj_tables.Model):
    """ A gene

    Attributes:
        id (:obj:`str`): identifier; primary, unique key
        symbol (:obj:`str`): symbol
        location (:obj:`Location`): location (one-to-one; the location refers
            back through its ``genes`` attribute)
    """
    id = obj_tables.StringAttribute(
        primary=True,
        unique=True,
        verbose_name='Id')
    symbol = obj_tables.StringAttribute(verbose_name='Symbol')
    location = obj_tables.OneToOneAttribute(
        'Location',
        related_name='genes',
        verbose_name='Location')

    class Meta(obj_tables.Model.Meta):
        verbose_name = 'Gene'
        verbose_name_plural = 'Genes'
        table_format = obj_tables.TableFormat.row
        attribute_order = ('id', 'symbol', 'location',)
class Transcript(obj_tables.Model):
    """ A transcript

    Attributes:
        id (:obj:`str`): identifier; primary, unique key
        gene (:obj:`Gene`): gene (many-to-one; the gene lists its transcripts
            through its ``transcripts`` attribute)
        location (:obj:`Location`): location (one-to-one; the location refers
            back through its ``transcripts`` attribute)
    """
    id = obj_tables.StringAttribute(
        primary=True,
        unique=True,
        verbose_name='Id')
    gene = obj_tables.ManyToOneAttribute(
        'Gene',
        related_name='transcripts',
        verbose_name='Gene')
    location = obj_tables.OneToOneAttribute(
        'Location',
        related_name='transcripts',
        verbose_name='Location')

    class Meta(obj_tables.Model.Meta):
        verbose_name = 'Transcript'
        verbose_name_plural = 'Transcripts'
        table_format = obj_tables.TableFormat.row
        attribute_order = ('id', 'gene', 'location',)
class Person(obj_tables.Model):
    """ A person

    Attributes:
        name (:obj:`str`): name; primary, unique key
        type (:obj:`str`): one of ``'family'``, ``'friend'`` or ``'business'``
        company (:obj:`Company`): company (many-to-one; the company lists its
            people through its ``employees`` attribute)
        email_address (:obj:`str`): email address
        phone_number (:obj:`str`): phone number
        address (:obj:`Address`): address (one-to-one; the address refers back
            through its ``person`` attribute)
    """
    name = obj_tables.StringAttribute(
        primary=True,
        unique=True,
        verbose_name='Name')
    type = obj_tables.EnumAttribute(
        ['family', 'friend', 'business'],
        verbose_name='Type')
    company = obj_tables.ManyToOneAttribute(
        'Company',
        related_name='employees',
        verbose_name='Company')
    email_address = obj_tables.EmailAttribute(verbose_name='Email address')
    phone_number = obj_tables.StringAttribute(verbose_name='Phone number')
    address = obj_tables.OneToOneAttribute(
        'Address',
        related_name='person',
        verbose_name='Address')

    class Meta(obj_tables.Model.Meta):
        verbose_name = 'Person'
        verbose_name_plural = 'People'
        table_format = obj_tables.TableFormat.row
        attribute_order = (
            'name', 'type', 'company', 'email_address', 'phone_number', 'address',)
class ProteinSpeciesType(core.PolymerSpeciesType):
    """ Knowledge of a protein monomer

    Attributes:
        uniprot (:obj:`str`): uniprot id
        transcript (:obj:`TranscriptSpeciesType`): transcript
        coding_regions (:obj:`list` of :obj:`LocusAttribute`): CDS coordinates

    Related attributes:
        transcription_factor_regulation (:obj:`list` of :obj:`TranscriptionFactorRegulation`):
            transcription factor regulation
        ptm_sites (:obj:`list` of :obj:`PtmSite`): protein modification sites
    """
    uniprot = obj_tables.StringAttribute()
    transcript = obj_tables.OneToOneAttribute(TranscriptSpeciesType, related_name='protein')
    coding_regions = LocusAttribute(related_name='proteins')

    class Meta(obj_tables.Model.Meta):
        verbose_name = 'Protein'
        verbose_name_plural = 'Proteins'
        attribute_order = ('id', 'name', 'uniprot', 'transcript', 'coding_regions',
                           'identifiers', 'references', 'comments')

    def _get_spliced_dna_seq(self):
        """ Get the coding DNA: the coding regions joined together and oriented
        according to the strand of the gene

        Returns:
            :obj:`Bio.Seq.Seq`: spliced coding DNA sequence
        """
        # Work through the coding regions in ascending coordinate order
        ordered_cds = sorted(self.coding_regions, key=lambda region: region.start)

        # Fetch the span from the first coding start to the last coding end once,
        # then cut the individual regions out of it
        polymer = self.transcript.gene.polymer
        first_start = ordered_cds[0].start
        dna_seq = polymer.get_subseq(start=first_start, end=ordered_cds[-1].end)

        # NOTE(review): `Bio.Alphabet` was removed in Biopython 1.78, so this call
        # needs an older Biopython -- confirm the pinned version before upgrading.
        spliced_dna_seq = Bio.Seq.Seq('', alphabet=Bio.Alphabet.DNAAlphabet())
        for region in ordered_cds:
            # Coordinates are shifted to be relative to the fetched span; `end` is
            # treated as inclusive, hence the + 1 on the slice bound
            spliced_dna_seq += dna_seq[region.start - first_start:
                                       region.end - first_start + 1]

        if self.transcript.gene.strand == core.PolymerStrand.negative:
            spliced_dna_seq = spliced_dna_seq.reverse_complement()

        return spliced_dna_seq

    def get_seq(self, table=1, cds=True):
        """ Get the amino acid sequence, obtained by splicing the coding regions,
        transcribing them and translating the result

        Args:
            table (:obj:`int`, optional): NCBI identifier for translation table
                (default = standard table)
            cds (:obj:`bool`, optional): True indicates the sequence is a complete CDS

        Returns:
            :obj:`Bio.Seq.Seq`: amino acid sequence
        """
        coding_rna_seq = self._get_spliced_dna_seq().transcribe()
        return coding_rna_seq.translate(table=table, cds=cds)

    def get_seq_and_start_codon(self, table=1, cds=True):
        """ Get the 5' to 3' amino acid sequence and the start codon

        Args:
            table (:obj:`int`, optional): NCBI identifier for translation table
                (default = standard table)
            cds (:obj:`bool`, optional): True indicates the sequence is a complete CDS

        Returns:
            :obj:`Bio.Seq.Seq`: coding RNA sequence that will be translated
            :obj:`Bio.Seq.Seq`: amino acid sequence
            :obj:`Bio.Seq.Seq`: start codon
        """
        coding_rna_seq = self._get_spliced_dna_seq().transcribe()
        protein_seq = coding_rna_seq.translate(table=table, cds=cds)

        # Each leading '*' in the translation accounts for one codon (three
        # nucleotides) to skip; the start codon is the first codon after them
        start_codon_index = 0
        for residue in protein_seq:
            if residue != '*':
                break
            start_codon_index += 3

        start_codon = coding_rna_seq[start_codon_index:start_codon_index + 3]

        return coding_rna_seq, protein_seq, start_codon

    def get_empirical_formula(self, table=1, cds=True, seq_input=None):
        """ Get the empirical formula

        Args:
            table (:obj:`int`, optional): NCBI identifier for translation table
                (default = standard table)
            cds (:obj:`bool`, optional): True indicates the sequence is a complete CDS
            seq_input (:obj:`Bio.Seq.Seq`, optional): amino acid sequence; if provided
                (and non-empty), it is used instead of calling :obj:`get_seq`, which
                avoids deriving the sequence again

        Returns:
            :obj:`chem.EmpiricalFormula`: empirical formula
        """
        seq = seq_input if seq_input else self.get_seq(table=table, cds=cds)

        # (C, H, N, O, S, Se) atom counts of each free amino acid, keyed by its
        # one-letter code
        composition = {
            'A': (3, 7, 1, 2, 0, 0),    # Ala: Alanine (C3 H7 N O2)
            'R': (6, 14, 4, 2, 0, 0),   # Arg: Arginine (C6 H14 N4 O2)
            'N': (4, 8, 2, 3, 0, 0),    # Asn: Asparagine (C4 H8 N2 O3)
            'D': (4, 7, 1, 4, 0, 0),    # Asp: Aspartic acid (C4 H7 N O4)
            'C': (3, 7, 1, 2, 1, 0),    # Cys: Cysteine (C3 H7 N O2 S)
            'Q': (5, 10, 2, 3, 0, 0),   # Gln: Glutamine (C5 H10 N2 O3)
            'E': (5, 9, 1, 4, 0, 0),    # Glu: Glutamic acid (C5 H9 N O4)
            'G': (2, 5, 1, 2, 0, 0),    # Gly: Glycine (C2 H5 N O2)
            'H': (6, 9, 3, 2, 0, 0),    # His: Histidine (C6 H9 N3 O2)
            'I': (6, 13, 1, 2, 0, 0),   # Ile: Isoleucine (C6 H13 N O2)
            'L': (6, 13, 1, 2, 0, 0),   # Leu: Leucine (C6 H13 N O2)
            'K': (6, 14, 2, 2, 0, 0),   # Lys: Lysine (C6 H14 N2 O2)
            'M': (5, 11, 1, 2, 1, 0),   # Met: Methionine (C5 H11 N O2 S)
            'F': (9, 11, 1, 2, 0, 0),   # Phe: Phenylalanine (C9 H11 N O2)
            'P': (5, 9, 1, 2, 0, 0),    # Pro: Proline (C5 H9 N O2)
            'S': (3, 7, 1, 3, 0, 0),    # Ser: Serine (C3 H7 N O3)
            'T': (4, 9, 1, 3, 0, 0),    # Thr: Threonine (C4 H9 N O3)
            'W': (11, 12, 2, 2, 0, 0),  # Trp: Tryptophan (C11 H12 N2 O2)
            'Y': (9, 11, 1, 3, 0, 0),   # Tyr: Tyrosine (C9 H11 N O3)
            'V': (5, 11, 1, 2, 0, 0),   # Val: Valine (C5 H11 N O2)
            'U': (3, 7, 1, 2, 0, 1),    # Selcys: Selenocysteine (C3 H7 N O2 Se)
        }

        # Sum the atoms of all residues as if they were free amino acids
        totals = [0] * 6
        for residue, atoms in composition.items():
            n_occurrences = seq.count(residue)
            for element_index, n_atoms in enumerate(atoms):
                totals[element_index] += n_atoms * n_occurrences

        # Every position except stop symbols ('*') counts as a residue; joining
        # them takes one bond fewer than there are residues, and each bond
        # removes one H2O
        n_residues = len(seq) - seq.count('*')
        n_bonds = n_residues - 1

        formula = chem.EmpiricalFormula()
        formula.C = totals[0]
        formula.H = totals[1] - 2 * n_bonds
        formula.N = totals[2]
        formula.O = totals[3] - n_bonds
        formula.S = totals[4]
        formula.Se = totals[5]
        return formula

    def get_charge(self, table=1, cds=True, seq_input=None):
        """ Get the charge at physiological pH

        The charge is computed as the number of R, H and K residues minus the
        number of D and E residues.

        Args:
            table (:obj:`int`, optional): NCBI identifier for translation table
                (default = standard table)
            cds (:obj:`bool`, optional): True indicates the sequence is a complete CDS
            seq_input (:obj:`Bio.Seq.Seq`, optional): amino acid sequence; if provided
                (and non-empty), it is used instead of calling :obj:`get_seq`, which
                avoids deriving the sequence again

        Returns:
            :obj:`int`: charge
        """
        seq = seq_input if seq_input else self.get_seq(table=table, cds=cds)

        n_positive = sum(seq.count(residue) for residue in ('R', 'H', 'K'))
        n_negative = sum(seq.count(residue) for residue in ('D', 'E'))
        return n_positive - n_negative

    def get_mol_wt(self, table=1, cds=True, seq_input=None):
        """ Get the molecular weight

        Args:
            table (:obj:`int`, optional): NCBI identifier for translation table
                (default = standard table)
            cds (:obj:`bool`, optional): True indicates the sequence is a complete CDS
            seq_input (:obj:`Bio.Seq.Seq`, optional): amino acid sequence; if provided
                (and non-empty), it is used instead of calling :obj:`get_seq`, which
                avoids deriving the sequence again

        Returns:
            :obj:`float`: molecular weight
        """
        # `get_empirical_formula` already falls back to `get_seq` when
        # `seq_input` is missing or empty, so no branching is needed here
        formula = self.get_empirical_formula(table=table, cds=cds, seq_input=seq_input)
        return formula.get_molecular_weight()