def test_EMBL_CCDS_RefSeq(self):
    """Fetch EMBL, RefSeq and CCDS entries and compare them with fixtures.

    The identifiers in ``ids`` are turned into queries by
    ``dba.UrlFormatter``, fetched through ``dba.Entry_fetcher`` on the
    asyncio event loop, split by ``dba.EntrySplitter`` and parsed by
    ``dba.DnaParser``.  Every parsed item must be contained in ``exp`` and
    the number of parsed items must equal ``len(exp)``.
    """
    # Expected results: CodingSequence(entry id, database, DNA Seq, protein Seq).
    exp = [
        CodingSequence(
            'CR456855', 'EMBL',
            Seq(
                'ATGGAGGGTCAACGCTGGCTGCCGCTGGAGGCCAATCCCGAGGTCACCAACCAGTTTCTTAAACAATTAGGTCTACATCCTAACTGGCAATTCGTTGATGTATATGGAATGGATCCTGAACTCCTTAGCATGGTACCAAGACCAGTCTGTGCAGTCTTACTTCTCTTTCCTATTACAGAAAAGTATGAAGTATTCAGAACAGAAGAGGAAGAAAAAATAAAATCTCAGGGACAAGATGTTACATCATCAGTATATTTCATGAAGCAAACAATCAGCAATGCCTGTGGAACAATTGGACTGATTCATGCTATTGCAAACAATAAAGACAAGATGCACTTTGAATCTGGATCAACCTTGAAAAAATTCCTGGAGGAATCTGTGTCAATGAGCCCTGAAGAACGAGCCAGATACCTGGAGAACTATGATGCCATCCGAGTTACTCATGAGACCAGTGCCCATGAAGGTCAGACTGAGGCACCAAGTATAGATGAGAAAGTAGATCTTCATTTTATTGCATTAGTTCATGTAGATGGGCATCTCTATGAATTAGATGGGCGGAAGCCATTTCCAATTAACCATGGTGAAACTAGTGATGAAACTTTATTAGAGGATGCCATAGAAGTTTGCAAGAAGTTTATGGAGCGCGACCCTGATGAACTAAGATTTAATGCGATTGCTCTTTCTGCAGCTTAA',
                IUPACUnambiguousDNA()),
            Seq(
                'MEGQRWLPLEANPEVTNQFLKQLGLHPNWQFVDVYGMDPELLSMVPRPVCAVLLLFPITEKYEVFRTEEEEKIKSQGQDVTSSVYFMKQTISNACGTIGLIHAIANNKDKMHFESGSTLKKFLEESVSMSPEERARYLENYDAIRVTHETSAHEGQTEAPSIDEKVDLHFIALVHVDGHLYELDGRKPFPINHGETSDETLLEDAIEVCKKFMERDPDELRFNAIALSAA',
                ExtendedIUPACProtein())),
        CodingSequence(
            'DQ917642', 'EMBL',
            Seq(
                'ATGACGGGCAATGCCGGGGAGTGGTGCCTCATGGAAAGCGACCCCGGAGTCTTCACCGAGCTCATTAAAGGATTCGGTTGCCGAGGAGCCCAAGTAGAAGAAATATGGAGTTTAGAGCCTGAGAATTTTGAAAAATTAAAGCCAGTTCATGGGCTGATTTTTCTTTTCAAGTGGCAGCCCGGAGAAGAACCAGCAGGCTCTGTGGTTCAGGACTCCCGACTTGACACGATATTTTTTGCCAAGCAGGTAATTAATAATGCTTGTGCTACTCAAGCCATAGTAAGTGTGTTATTGAACTGTACCCATCAGGATGTCCATTTAGGAGAGACATTGTCAGAGTTTAAGGAATTCTCACAAAGTTTTGATGCAGCTATGAAAGGTTTGGCCCTGAGTAATTCGGATGTGATTCGCCAAGTACACAACAGTTTCGCCAGACAGCAAATGTTTGAATTTGATGCAAAGACATCAGCAAAAGAAGAAGATGCTTTTCACTTTGTCAGTTACGTTCCTGTGAATGGAAGACTGTACGAATTAGATGGATTAAGAGAAGGACCGATCGATTTAGGTGCATGCAATCAAGATGACTGGATCAGCGCAGTGAGGCCAGTCATAGAAAAAAGGATACAAAAGTACAGTGAAGGTGAAATTCGATTTAACTTAATGGCCATTGTGTCTGACAGGAAAATGATATATGAACAGAAGATAGCAGAGTTACAAAGACAGCTTGCTGAGGAGGAACCCATGGATACAGATCAGGGTAGTAACATGTTAAGTGCTATTCAGTCAGAAGTTGCCAAAAATCAGATGCTTATTGAAGAAGAAGTACAGAAATTAAAAAGATATAAGATTGAAAACATCAGAAGGAAGCATAATTATCTGCCTTTCATTATGGAACTGTTAAAGACTTTAGCAGAACACCAGCAGTTAATACCTCTCGTAGAAAAGGCAAAAGAAAAACAGAATGCGAAGAAGGCACAGGAAACCAAATGA',
                IUPACUnambiguousDNA()),
            Seq(
                'MTGNAGEWCLMESDPGVFTELIKGFGCRGAQVEEIWSLEPENFEKLKPVHGLIFLFKWQPGEEPAGSVVQDSRLDTIFFAKQVINNACATQAIVSVLLNCTHQDVHLGETLSEFKEFSQSFDAAMKGLALSNSDVIRQVHNSFARQQMFEFDAKTSAKEEDAFHFVSYVPVNGRLYELDGLREGPIDLGACNQDDWISAVRPVIEKRIQKYSEGEIRFNLMAIVSDRKMIYEQKIAELQRQLAEEEPMDTDQGSNMLSAIQSEVAKNQMLIEEEVQKLKRYKIENIRRKHNYLPFIMELLKTLAEHQQLIPLVEKAKEKQNAKKAQETK',
                ExtendedIUPACProtein())),
        CodingSequence(
            'NM_001270952', 'RefSeq',
            Seq(
                'ATGGATCCTGAACTCCTTAGCATGGTACCAAGACCAGTCTGTGCAGTCTTACTTCTCTTTCCTATTACAGAAAAGTATGAAGTATTCAGAACAGAAGAGGAAGAAAAAATAAAATCTCAGGGACAAGATGTTACATCATCAGTATATTTCATGAAGCAAACAATCAGCAATGCCTGTGGAACAATTGGACTGATTCATGCTATTGCAAACAATAAAGACAAGATGCACTTTGAATCTGGATCAACCTTGAAAAAATTCCTGGAGGAATCTGTGTCAATGAGCCCTGAAGAACGAGCCAGATACCTGGAGAACTATGATGCCATCCGAGTTACTCATGAGACCAGTGCCCATGAAGGTCAGACTGAGGCACCAAGTATAGATGAGAAAGTAGATCTTCATTTTATTGCATTAGTTCATGTAGATGGGCATCTCTATGAATTAGATGGGCGGAAGCCATTTCCAATTAACCATGGTGAAACTAGTGATGAAACTTTATTAGAGGATGCCATAGAAGTTTGCAAGAAGTTTATGGAGCGCGACCCTGATGAACTAAGATTTAATGCGATTGCTCTTTCTGCAGCATAG',
                IUPACUnambiguousDNA()),
            Seq(
                'MDPELLSMVPRPVCAVLLLFPITEKYEVFRTEEEEKIKSQGQDVTSSVYFMKQTISNACGTIGLIHAIANNKDKMHFESGSTLKKFLEESVSMSPEERARYLENYDAIRVTHETSAHEGQTEAPSIDEKVDLHFIALVHVDGHLYELDGRKPFPINHGETSDETLLEDAIEVCKKFMERDPDELRFNAIALSAA',
                ExtendedIUPACProtein())),
        # The CCDS fixtures are built without an explicit alphabet argument.
        CodingSequence(
            'CCDS73586.1', 'CCDS',
            Seq('ATGGATCCTGAACTCCTTAGCATGGTACCAAGACCAGTCTGTGCAGTCTTACTTCTCTTTCCTATTACAGAAAAGTATGAAGTATTCAGAACAGAAGAGGAAGAAAAAATAAAATCTCAGGGACAAGATGTTACATCATCAGTATATTTCATGAAGCAAACAATCAGCAATGCCTGTGGAACAATTGGACTGATTCATGCTATTGCAAACAATAAAGACAAGATGCACTTTGAATCTGGATCAACCTTGAAAAAATTCCTGGAGGAATCTGTGTCAATGAGCCCTGAAGAACGAGCCAGATACCTGGAGAACTATGATGCCATCCGAGTTACTCATGAGACCAGTGCCCATGAAGGTCAGACTGAGGCACCAAGTATAGATGAGAAAGTAGATCTTCATTTTATTGCATTAGTTCATGTAGATGGGCATCTCTATGAATTAGATGGGCGGAAGCCATTTCCAATTAACCATGGTGAAACTAGTGATGAAACTTTATTAGAGGATGCCATAGAAGTTTGCAAGAAGTTTATGGAGCGCGACCCTGATGAACTAAGATTTAATGCGATTGCTCTTTCTGCAGCATAG'
                ),
            Seq('MDPELLSMVPRPVCAVLLLFPITEKYEVFRTEEEEKIKSQGQDVTSSVYFMKQTISNACGTIGLIHAIANNKDKMHFESGSTLKKFLEESVSMSPEERARYLENYDAIRVTHETSAHEGQTEAPSIDEKVDLHFIALVHVDGHLYELDGRKPFPINHGETSDETLLEDAIEVCKKFMERDPDELRFNAIALSAA'
                )),
        CodingSequence(
            'CCDS86041.1', 'CCDS',
            Seq('ATGACGGGCAATGCCGGGGAGTGGTGCCTCATGGAAAGCGACCCCGGGGTCTTCACCGAGCTCATTAAAGGATTCGGTTGCCGAGGAGCCCAAGTAGAAGAAATATGGAGTTTAGAGCCTGAGAATTTTGAAAAATTAAAGCCAGTTCATGGGTTAATTTTTCTTTTCAAGTGGCAGCCAGGAGAAGAACCAGCAGGCTCTGTGGTTCAGGACTCCCGACTTGACACGATATTTTTTGCTAAGCAGGTAATTAATAATGCTTGTGCTACTCAAGCCATAGTGAGTGTGTTACTGAACTGTACCCACCAGGATGTCCATTTAGGCGAGACATTATCAGAGTTTAAAGAATTTTCACAAAGTTTTGATGCAGCTATGAAAGGCTTGGCACTGAGCAATTCAGATGTGATTCGACAAGTACACAACAGTTTCGCCAGACAGCAAATGTTTGAATTTGATACGAAGACATCAGCAAAAGAAGAAGATGCTTTTCACTTTGTCAGTTATGTTCCTGTTAATGGGAGACTGTATGAATTAGATGGATTAAGAGAAGGACCGATTGATTTAGGTGCATGCAATCAAGATGATTGGATCAGTGCAGTAAGGCCTGTCATAGAAAAAAGGATACAAAAAGACGGGTTTTCACCATGTTGCCCAGGCTGGTCTCAGACTCCTGAGCTCAAGCCATCCGCCTGCCTCGACCTCCCAAAGTGGTACAGTGAAGGTGAAATTCGATTTAATTTAATGGCCATTGTGTCTGACAGAAAAATGATATATGAGCAGAAGATAGCAGAGTTACAAAGACAACTTGCAGAGGAACCCATGGATACAGATCAAGGTAATAGTATGTTAAGTGCTATTCAGTCAGAAGTTGCCAAAAATCAGATGCTTATTGAAGAAGAAGTACAGAAATTAAAAAGATACAAGATTGAGAATATCAGAAGGAAGCATAATTATCTGCCTTTCATTATGGAATTGTTAAAGACTTTAGCAGAACACCAGCAGTTAATACCACTAGTAGAAAAGGCAAAAGAAAAACAGAACGCAAAGAAAGCTCAGGAAACCAAATGA'
                ),
            Seq('MTGNAGEWCLMESDPGVFTELIKGFGCRGAQVEEIWSLEPENFEKLKPVHGLIFLFKWQPGEEPAGSVVQDSRLDTIFFAKQVINNACATQAIVSVLLNCTHQDVHLGETLSEFKEFSQSFDAAMKGLALSNSDVIRQVHNSFARQQMFEFDTKTSAKEEDAFHFVSYVPVNGRLYELDGLREGPIDLGACNQDDWISAVRPVIEKRIQKDGFSPCCPGWSQTPELKPSACLDLPKWYSEGEIRFNLMAIVSDRKMIYEQKIAELQRQLAEEPMDTDQGNSMLSAIQSEVAKNQMLIEEEVQKLKRYKIENIRRKHNYLPFIMELLKTLAEHQQLIPLVEKAKEKQNAKKAQETK'
                ))
    ]
    # Identifiers to request, grouped by source database.
    ids = {
        'EMBL': ['CR456855.1', 'DQ917642.1'],
        'RefSeq': ['NM_001270952.1'],
        'CCDS': ['CCDS73586.1', 'CCDS86041.1']
    }
    # Build one flat list of queries across all databases.
    formatter = dba.UrlFormatter()
    queries = []
    for database, id_list in ids.items():
        queries += formatter.format(database, id_list)
    # Run the fetch coroutine to completion on the current event loop.
    # NOTE(review): presumably this performs real network requests - confirm.
    loop = asyncio.get_event_loop()
    fetcher = dba.Entry_fetcher()
    entries = loop.run_until_complete(fetcher.fetch_all(queries))
    splitter = dba.EntrySplitter()
    entries = splitter.split(entries)
    # NOTE(review): this closes the loop returned by get_event_loop(); verify
    # that later tests do not expect to reuse it.
    loop.close()
    parser = dba.DnaParser()
    res = parser.parse(entries)
    # Every parsed entry must be one of the expected fixtures ...
    for item in res:
        self.assertTrue(item in exp)
    # ... and nothing may be missing or duplicated.
    self.assertEqual(len(exp), len(res))
def REsearch(goi='', goiFile='', mcs='', mcsFile=''):
    """List restriction sites that occur in the MCS but whose enzyme does not cut the GOI.

    Each of the two sequences is taken from its string argument when that is
    non-empty, otherwise it is loaded with ``read_seq`` from the matching
    file argument.

    Returns a list of ``(enzyme name, site, "blunt" or e.elucidate(),
    space-joined e.suppl)`` tuples, one per MCS site, sorted by site.

    Raises ``Exception`` when either sequence ends up empty.
    """
    rb = RestrictionBatch(suppliers=[
        'C', 'B', 'E', 'I', 'K', 'J', 'M', 'O', 'N', 'Q', 'S', 'R', 'V', 'Y',
        'X'
    ])

    if goi:
        goi = Seq(goi, IUPACUnambiguousDNA())
    else:
        goi = read_seq(goiFile)
    if not goi:
        raise Exception('Please provide a GOI sequence!')

    if mcs:
        mcs = Seq(mcs, IUPACUnambiguousDNA())
    else:
        mcs = read_seq(mcsFile)
    if not mcs:
        raise Exception('Please provide a MCS sequence!')

    mcs_hits = rb.search(mcs)
    goi_hits = rb.search(goi)

    # Enzymes with at least one site in the MCS, minus those that cut the GOI.
    mcs_cutters = {enzyme for enzyme in mcs_hits.keys() if mcs_hits[enzyme]}
    goi_cutters = {enzyme for enzyme in goi_hits.keys() if goi_hits[enzyme]}
    usable = mcs_cutters - goi_cutters

    rows = [
        (str(enzyme),
         site,
         "blunt" if enzyme.is_blunt() else enzyme.elucidate(),
         ' '.join(enzyme.suppl))
        for enzyme in usable
        for site in mcs_hits[enzyme]
    ]
    rows.sort(key=lambda row: row[1])
    return rows
def pcr(
        cls,
        seq: Union[str, Seq, SeqRecord],
        fwd_padding: str = "",
        rev_padding: str = "",
) -> "Primers":
    """Build a Primers pair that amplifies a sequence-like object.

    Args:
        seq: the sequence-like object to amplify via primers

    Keyword Args:
        fwd_padding: extra bp added to the 5' end of the FWD primer,
            e.g. a restriction enzyme site
        rev_padding: extra bp added to the 5' end of the REV primer. Because
            they sit on the 5' end of the rev primer they are the reverse
            complement bp of the template sequence

    Returns:
        A Primers object holding the FWD/REV sequences and their tm values
    """
    fwd, rev = primers(_get_seq(seq), add_fwd=fwd_padding, add_rev=rev_padding)

    fwd_seq = Seq(fwd.seq, alphabet=IUPACUnambiguousDNA())
    rev_seq = Seq(rev.seq, alphabet=IUPACUnambiguousDNA())
    return Primers(fwd_seq, fwd.tm, rev_seq, rev.tm)
def primer(side: str) -> Tuple[Seq, float]:
    """Return (sequence, tm) of the "LEFT" or "RIGHT" primer stored in p3_output."""
    assert side in ("LEFT", "RIGHT")

    key_prefix = f"PRIMER_{side}_0_"
    raw_seq = p3_output[key_prefix + "SEQUENCE"]
    raw_tm = p3_output[key_prefix + "TM"]
    return Seq(raw_seq, alphabet=IUPACUnambiguousDNA()), float(raw_tm)
def gibson(records: List[SeqRecord], hifi: bool = False) -> Tuple[SeqRecord, List[Primers]]:
    """Create primers for records for a single Gibson Assembly.

    Create primers to mutate the records' sequences (after PCR) so they
    anneal to their neighboring records. Each record is paired with the next
    one in the list, and the last record is paired with the first.

    Args:
        records: list of records to assemble (must be non-empty)

    Keyword Args:
        hifi: whether to use HiFi DNA assembly

    Returns:
        1. assembled plasmid (SeqRecord)
        2. list of primer pairs (same length as records)
    """
    assert records
    # Work on the results of .upper(); `records` is rebound, not edited in place.
    records = [r.upper() for r in records]
    plasmid = records[0].upper()
    # primers[k] belongs to records[k]; the first pair is created up front.
    primers: List[Primers] = [Primers.pcr(records[0])]
    for i, f1 in enumerate(records):
        # index of the neighbor, wrapping from the last record back to the first
        j = (i + 1) % len(records)
        f2 = records[j]
        # records[0] already has its primers, so skip it on the wrap-around
        if j != 0:
            primers.append(Primers.pcr(f2))
        # if hifi is false, mismatches is 0
        homology, homology_length, mismatch_lengths = _record_homology(
            str(f1.seq), str(f2.seq), hifi)
        # remove mismatches up to 10 bp if doing HiFi DNA assembly
        if hifi and mismatch_lengths:
            f1_mm_length, f2_mm_length = mismatch_lengths
            # trim the end of f1 and the start of f2 (this mutates the records
            # held in `records`, after their primers were created above)
            f1.seq = f1.seq[:len(f1.seq) - MAX_HOMOLOGY + f1_mm_length - 1]
            f2.seq = f2.seq[f2_mm_length:]
        if homology:
            # homology already exists between records.
            # only the part of f2 after the shared region is appended
            plasmid += f2[homology_length:].upper()
            plasmid.id += f"|{f2.id}"
        else:
            # homology does not exist between records, introduce it to primers
            plasmid += f2.upper()
            plasmid.id += f"|{f2.id}"
            _mutate_primers(primers[i], primers[j], MIN_HOMOLOGY // 2)
    # The "|"-joined id built in the loop is replaced by a "+"-joined id that
    # leaves out records whose id is the "<unknown id>" placeholder.
    plasmid.id = "+".join(r.id for r in records if r.id != "<unknown id>")
    plasmid.seq = Seq(str(plasmid.seq.upper()), alphabet=IUPACUnambiguousDNA())
    # extend primers in 5' direction to avoid duplicate junctions
    _fix_duplicate_junctions(records, primers)
    return plasmid, primers
def parse(self, database, xml_soup):
    '''
    Extract the coding sequence of a single GenBank XML entry.

    Input:  database - name stored on the returned CodingSequence
            xml_soup - Beautifulsoup(xml) of Genbank entry
    Output: CodingSequence(entry_id, database, DNA sequence, Protein sequence)

    Only the first CDS feature is used. Its location must be either
    "start..stop" or "join(start..pos,pos..stop)" where both inner
    positions are identical.

    Raises NotAnORF if the entry is neither mRNA nor cDNA, if the CDS
    location is partial ('<' / '>') or has an unsupported join, or if the
    located DNA is not ATG...stop with a length divisible by three.
    Raises SequenceNotFoundError if the entry has no features or no CDS.
    '''
    moltype = xml_soup.find_all('GBSeq_moltype')[0].text.strip()
    if moltype not in ('mRNA', 'cDNA'):
        raise NotAnORF
    id_ = xml_soup.find_all('GBSeq_locus')[0].text.strip()
    features = xml_soup.find_all('GBFeature')
    if not features:
        # entirely unannotated entry - ultra rare
        raise SequenceNotFoundError

    for f in features:
        if f.GBFeature_key.text.strip() != 'CDS':
            continue
        loc = f.find_all('GBFeature_location')[0].text
        if loc.startswith('join('):
            # sometimes format is join(start..pos2,pos3..stop);
            # if pos2 != pos3, or more intervals are indicated, we ignore
            # this entry
            positions = []
            for chunk in loc[5:-1].split('..'):
                for pos in chunk.split(','):
                    # De-duplicate while KEEPING the original order: a set
                    # would return start/pos/stop in arbitrary order.
                    if pos not in positions:
                        positions.append(pos)
            if len(positions) != 3:
                raise NotAnORF
            start, stop = positions[0], positions[2]
        else:
            # mostly feature location is simply encoded as start..stop
            start, stop = loc.split('..')
        if '<' in start or '>' in stop:
            # start or stop codon not known
            raise NotAnORF
        start, stop = int(start), int(stop)
        break
    else:
        # no CDS feature at all - not sure this ever happens
        raise SequenceNotFoundError

    dna_seq = xml_soup.GBSeq_sequence.text.strip().upper()
    # GenBank positions are 1-based and inclusive
    orf = dna_seq[start - 1:stop]
    # Explicit checks instead of assert, so they survive `python -O`.
    looks_like_orf = (orf.startswith('ATG')
                      and orf.endswith(('TAA', 'TGA', 'TAG'))
                      and len(orf) % 3 == 0)
    if not looks_like_orf:
        raise NotAnORF
    cds = Seq(orf, IUPACUnambiguousDNA())
    return CodingSequence(id_, database, cds, cds.translate(cds=True))
def hustle_bustle(seq):
    """Shuffle the part of ``seq`` that starts at its first in-frame stop codon.

    ``seq`` is joined into one DNA string and translated. When the peptide
    contains a "*", everything from that codon onwards is passed through
    ``truffle_shuffle`` and appended to the untouched leading part. When a
    ``ValueError`` is raised (e.g. no "*" present) ``seq`` is returned as is.

    NOTE(review): presumably ``seq`` is a list of single-base strings
    (it is sliced and the slice is ``.extend``-ed) - confirm with callers.
    """
    dna = Seq("".join(seq), alphabet=IUPACUnambiguousDNA())
    peptide = str(dna.translate())
    try:
        # nucleotide offset of the first stop codon
        cut = peptide.index("*") * 3
        head = seq[:cut]
        tail = seq[cut:]
        head.extend(truffle_shuffle(tail))
    except ValueError:
        return seq
    return head
def parse(self, html_soup):
    '''
    Extract the coding sequence from a CCDS report page.

    Input:  html_soup - Beautifulsoup(html) of a CCDS report
    Output: CodingSequence(id, 'CCDS', DNA sequence, Protein sequence)

    Raises AssertionError if the protein shown on the page is not the
    translation of the nucleotide sequence shown on the page.
    Raises IndexError if no "Report for CCDS..." title is found.
    '''
    # title: Report for CCDS[id].[version] (current version)
    # .[version] is optional.
    # " (current version)" might not be present
    titlematcher = re.compile(
        r'Report for CCDS[0-9]*(?:\.[0-9]*)?(?:\ \(current version\))?')
    id_ = html_soup.find_all(string=titlematcher)[0]  # find() does not take kwargs
    # The dot is escaped so that an id without a version cannot swallow the
    # character following it (e.g. the space before "(current version)").
    idmatcher = r'CCDS[0-9]*(?:\.[0-9]*)?'
    id_ = re.search(idmatcher, id_).group(0)

    # Sequence residues are spread over <span id="n123"> / <span id="p123">.
    nucleotides = html_soup.find_all('span', {'id': re.compile('n[0-9]+')})
    aminoacids = html_soup.find_all('span', {'id': re.compile('p[0-9]+')})
    dna_seq = Seq(''.join(nt.text for nt in nucleotides),
                  IUPACUnambiguousDNA())
    aa_seq = Seq(''.join(aa.text for aa in aminoacids),
                 ExtendedIUPACProtein())

    # Raised explicitly (same exception type as before) so that the
    # consistency check is not stripped when Python runs with -O.
    if not (aa_seq == dna_seq.translate(cds=True)):
        raise AssertionError(
            'protein sequence of %s does not match its translated CDS' % id_)
    return CodingSequence(id_, 'CCDS', dna_seq, aa_seq)
def parse_non_eukaryotes(self, database, xml):
    '''
    Extract every usable coding sequence from a GenBank XML entry.

    Input:  database - name stored on each returned CodingSequence
            xml      - GenBank XML text of one entry
    Output: list of CodingSequence(entry_id, database, DNA, Protein)

    CDS features whose location is partial ('<' / '>'), is not a plain
    "start..stop" / "complement(start..stop)", or does not translate as a
    complete CDS with codon table 11 are skipped.

    Raises SequenceNotFoundError if the entry has no features, or has a CDS
    feature but no sequence.
    Raises NotAnORF if no CDS feature yields a valid coding sequence.
    '''
    xml_soup = BeautifulSoup(xml, 'xml')
    id_ = str(xml_soup.find_all('GBSeq_primary-accession')[0].text).strip()
    features = xml_soup.find_all('GBFeature')
    if not features:
        # entirely unannotated entry - ultra rare
        raise SequenceNotFoundError

    # unlike eukaryotes, there are usually multiple CDS per entry, and they
    # might be on complementary strands...
    coding_sequences = []
    sense_seq = None      # entry sequence, built when the first CDS is seen
    antisense_seq = None  # its reverse complement, built on first use
    for f in features:
        if f.GBFeature_key.text.strip() != 'CDS':
            continue
        if sense_seq is None:
            try:
                sense_seq = Seq(xml_soup.GBSeq_sequence.text.strip().upper(),
                                IUPACUnambiguousDNA())
            except AttributeError:
                # entry does not actually have a normal sequence
                # (e.g. HOPD_ECOLX)
                raise SequenceNotFoundError

        loc = f.find_all('GBFeature_location')[0].text
        try:
            start, stop = loc.split('..')
        except ValueError:
            # join(...) with several intervals, single-base locations, ...
            continue
        if '<' in start or '>' in stop:
            # start or stop codon not known
            continue
        try:
            if 'COMPLEMENT(' in start.upper():
                # cds on other strand: complement([start]..[stop])
                start = int(start.split('(')[-1])
                stop = int(stop.replace(')', ''))
                if antisense_seq is None:
                    antisense_seq = sense_seq.reverse_complement()
                dna_seq = antisense_seq
                # remap the 1-based inclusive coordinates onto the
                # reverse-complemented sequence
                start, stop = (len(dna_seq) - stop + 1,
                               len(dna_seq) - start + 1)
            else:
                # cds on sense strand
                start, stop = int(start), int(stop)
                dna_seq = sense_seq
        except ValueError:
            # some other abstruse way of indicating starts and stops
            continue

        orf = dna_seq[start - 1:stop]
        try:
            # note that we use bacterial codon table
            protein_seq = orf.translate(table=11, cds=True)
        except TranslationError:
            continue  # not a good CDS
        coding_sequences.append(
            CodingSequence(id_, database, orf, protein_seq))

    if not coding_sequences:
        # not sure this ever happens
        raise NotAnORF
    return coding_sequences
def do_it_all(motif_file,
              copy_rule1: int = 10,
              copy_rule2: int = 12,
              how_many_Ns: int = 1,
              nresults: int = 1) -> list:
    """Assemble ``nresults`` distinct cassettes from shuffled motifs.

    :param motif_file: motif input handed to ``generate_parts_for_cassette``
    :param copy_rule1: pick this many random variants
    :param copy_rule2: make each possible variant in this many copies
    :param how_many_Ns: number of N's placed in front of every motif
    :param nresults: number of motif assemblies to output
    :return: list of SeqRecord cassettes, each motif annotated as a
        ``misc_binding`` feature
    """
    # de-ambiguated motifs in the right copy numbers
    motifs = generate_parts_for_cassette(motif_file, copy_rule1, copy_rule2)

    # keep drawing shuffled orders until exactly `nresults` distinct ones exist
    orderings = set()
    while len(orderings) != nresults:
        orderings.add(tuple(shuffle_motifs(motifs)))

    cassettes = []
    for number, ordering in enumerate(orderings, start=1):
        cassette = SeqRecord(Seq("", IUPACUnambiguousDNA()),
                             id=f"id_cassette_{number}",
                             name=f"name_cassette_{number}",
                             description="metmap generated cassette",
                             annotations={'date': "08-MAR-1983"})
        offset = 0
        for motif, de_motif in ordering:
            # spacer of de-ambiguated N's, then the motif itself
            cassette += deambigulate_random("N" * how_many_Ns)
            offset += how_many_Ns
            cassette += de_motif
            motif_end = offset + len(de_motif)
            cassette.features.append(
                SeqFeature(FeatureLocation(offset, motif_end),
                           type='misc_binding',
                           qualifiers={'note': motif}))
            offset = motif_end
        cassettes.append(cassette)
    return cassettes
def _get_seq_from_surrounding(record, start: int, end: int) -> str:
    """Fetch the reference sequence for ``[start, end)`` relative to ``record``.

    The species, chromosome, strand and reference indices are read from
    ``record.annotations``. For a transcript on strand -1 the coordinates are
    counted back from the right reference index and the fetched sequence is
    returned reverse-complemented (as a ``Seq``); otherwise the result of
    ``_fetch_seq`` is returned unchanged.
    """
    info = record.annotations
    species = info['reference_species']
    chr_num = info['reference_chromosome_number']

    if info['transcript_strand'] != -1:
        left = info['reference_left_index']
        # Note: ensembl is inclusive range and biopython is exclusive
        return _fetch_seq(species, chr_num, left + start, left + end - 1)

    right = info['reference_right_index']
    # Note: ensembl is inclusive range and biopython is exclusive
    fetched = _fetch_seq(species, chr_num, right - end + 1, right - start)
    return Seq(fetched, IUPACUnambiguousDNA()).reverse_complement()
class DNA(Polynucleotide):
    """Polynucleotide with DNA alphabets and DNA/RNA/protein accessors.

    Every accessor takes an optional ``as_string`` keyword (default False);
    when it is truthy the result is passed through ``str``. All other
    arguments are forwarded untouched.
    """

    # alphabet used for each strictness level
    alphabet_dict = {
        'strict': IUPACUnambiguousDNA(),
        'permissive': IUPACAmbiguousDNA(),
    }

    def get_dna(self, *args, **kwargs):
        """Return ``self.convert_sequence(...)``."""
        want_string = kwargs.pop('as_string', False)
        dna = self.convert_sequence(*args, **kwargs)
        return str(dna) if want_string else dna

    def get_rna(self, *args, **kwargs):
        """Return the transcription of ``self.convert_sequence(...)``."""
        want_string = kwargs.pop('as_string', False)
        rna = self.convert_sequence(*args, **kwargs).transcribe()
        return str(rna) if want_string else rna

    def get_protein(self, *args, **kwargs):
        """Return ``self.translate(...)``."""
        want_string = kwargs.pop('as_string', False)
        protein = self.translate(*args, **kwargs)
        return str(protein) if want_string else protein
#!python3 import pandas as pd import numpy as np import os from Bio import AlignIO from Bio.Alphabet.IUPAC import IUPACUnambiguousDNA from Bio.Data.CodonTable import TranslationError from ete3 import Tree f = os.path.abspath('..') + "/DataEmpirical/Cetacea" phy = AlignIO.read("{0}/datadryad/DATASET_B.phylip".format(f), format="phylip-relaxed", alphabet=IUPACUnambiguousDNA()) print("{0} taxa.".format(len(phy))) taxa = Tree("{0}/rootedtree.nhx".format(f), format=1).get_leaf_names() precision_dict = {} coverage_dict = {} with open("{0}/datadryad/Cetacea_gene_partition.txt".format(f), "r") as gene_partition: for line in gene_partition: name, pos = line.replace("DNA,", "").replace(" ", "").split("=") down, up = pos.split("-") down, up = int(down), int(up) diff = 1 + up - down if diff % 3 != 0: continue sequences = phy[:, down - 1:up] output = phy[:, :0] filtered = [rec for rec in sequences if rec.id in taxa] for pos in range(0, int(diff / 3)): keep_site = True
def __init__(self, promoter_sequences_filename= None):
    """Load promoter sequences from a FASTA file into ``self.promoter_sequences``.

    Without a filename, ``promoter_sequences.fasta`` inside the ``Data``
    directory is read; otherwise the given path is used as is. The records
    are stored as a dict that is re-keyed by ``cks.change_keys_SGDID``.
    """
    if promoter_sequences_filename is None:
        fasta_path = path.join('Data', "promoter_sequences.fasta")
    else:
        fasta_path = promoter_sequences_filename

    records = SeqIO.parse(fasta_path, "fasta", alphabet=IUPACUnambiguousDNA())
    self.promoter_sequences = SeqIO.to_dict(records)
    self.promoter_sequences = cks.change_keys_SGDID(self.promoter_sequences)
'T': 'A', 'C': 'G', 'G': 'C', 'X': 'X', 'N': 'N', 'a': 't', 't': 'a', 'c': 'g', 'g': 'c', 'x': 'x', 'n': 'n', '-': '-' } # Shortcut to the list of DNA bases: bases = sorted(list(IUPACUnambiguousDNA().letters)) ambiguous_bases = sorted(list(IUPACAmbiguousDNA().letters)) def base_complement(k): """ Return complement of base. Performs the subsitutions: A<=>T, C<=>G, X=>X for both upper and lower case. The return value is identical to the argument for all other values. :param k: A base. :returns: Complement of base. :rtype: str """ try:
def alignment_summary(inFastaFileOne, inFastaFileTwo, outfileName=None, printCounts=False):
    """ Write or print pairwise alignment summary information for sequences in two FASTA
        files, including SNPs, ambiguous bases, and indels.

        The two files are transposed into per-chromosome FASTAs, each of which
        is aligned with MUSCLE; the alignment columns are then classified and
        the counts summed over all chromosomes. With ``printCounts`` the
        per-chromosome and total counts are printed; with ``outfileName`` the
        totals are written as a two-row tab-separated file.
    """
    gap = '-'
    ambiguous = 'N'
    aligner = tools.muscle.MuscleTool()

    per_chr_fastas = interhost.transposeChromosomeFiles(
        [inFastaFileOne, inFastaFileTwo])

    # (counter name, label exactly as it is printed), in output order
    fields = (
        ("same_unambig", "same_unambig "),
        ("snp_unambig", "snp_unambig "),
        ("indel_unambig", "indel_unambig"),
        ("indel_ambig", "indel_ambig "),
        ("ambig_one", "ambig_one "),
        ("ambig_two", "ambig_two "),
        ("ambig_both", "ambig_both "),
        ("unambig_both", "unambig_both "),
    )
    results = OrderedDict((key, 0) for key, _ in fields)
    unambiguous = IUPACUnambiguousDNA().letters

    for chr_fasta in per_chr_fastas:
        counts = {key: 0 for key, _ in fields}

        alignOutFileName = util.file.mkstempfname('.fasta')
        aligner.execute(chr_fasta, alignOutFileName, fmt="clw")

        with open(alignOutFileName, "r") as f:
            alignment = Bio.AlignIO.read(f, "clustal")

            for col_idx in range(alignment.get_alignment_length()):
                col = alignment[:, col_idx]
                c1, c2 = col[0], col[1]

                c1_ambig = c1 in ambiguous
                c2_ambig = c2 in ambiguous
                c1_unambig = c1 in unambiguous
                c2_unambig = c2 in unambiguous

                if c1_ambig and c2_ambig:
                    counts["ambig_both"] += 1
                elif c1_ambig:
                    counts["ambig_one"] += 1
                elif c2_ambig:
                    counts["ambig_two"] += 1

                if c1_unambig and c2_unambig:
                    counts["unambig_both"] += 1
                    if c1 == c2:
                        counts["same_unambig"] += 1
                    else:
                        counts["snp_unambig"] += 1

                # a gap opposite a called / an ambiguous base
                if (c1 == gap and c2_unambig) or (c2 == gap and c1_unambig):
                    counts["indel_unambig"] += 1

                if (c1 == gap and c2_ambig) or (c2 == gap and c1_ambig):
                    counts["indel_ambig"] += 1

        if printCounts:
            print("Counts for this segment/chromosome:")
            for key, label in fields:
                print(label, counts[key])

        for key, _ in fields:
            results[key] += counts[key]

    if printCounts:
        print("\nCounts for this sample:")
        for key, label in fields:
            print(label, results[key])

    if outfileName:
        with open(outfileName, "wt") as of:
            csvout = csv.writer(of, delimiter='\t')
            csvout.writerow(list(results.keys()))
            csvout.writerow(list(results.values()))
def clone_combinatorial(
        record_set: List[SeqRecord],
        enzymes: List[RestrictionType],
        include: List[str] = None,
        min_count: int = -1,
        linear: bool = True,
) -> List[Tuple[List[SeqRecord], List[SeqRecord]]]:
    """Parse a single list of SeqRecords to find all circularizable plasmids.

    Turn each SeqRecord's post-digest seqs into a graph where the nodes are the
    overhangs and the edges are the linear fragments post-digest/catalyzing
    with BsaI/BpiI. Every cycle in that graph is a candidate plasmid.

    Args:
        record_set: single record set that might circularize
        enzymes: list of enzymes to digest the input records with

    Keyword Args:
        include: the include to filter assemblies
        min_count: mininum number of SeqRecords for an assembly to be considered
        linear: Whether the individual SeqRecords are assumed to be linear

    Returns:
        A list of tuples with:
            1. plasmids that will form
            2. SeqRecords that went into each formed plasmid
    """
    graph = nx.MultiDiGraph()
    seen_seqs: Set[str] = set(
    )  # stored list of input seqs (not new combinations)
    for record in record_set:
        # Each input is stored doubled, in both orientations; the substring
        # test further down then matches any rotation of an input sequence.
        seen_seqs.add(str(record.seq + record.seq).upper())
        seen_seqs.add(
            str((record.seq + record.seq).reverse_complement().upper()))
        # one edge per digested fragment, from its left to its right overhang
        for left, frag, right in _catalyze(record, enzymes, linear):
            graph.add_node(left)
            graph.add_node(right)
            graph.add_edge(left, right, frag=frag)
    try:  # find all circularizable cycles
        cycles = simple_cycles(graph)
    except NetworkXNoCycle:
        return []
    # get the fragments, enzymes back out of the cycle
    ids_to_fragments: Dict[str, List[SeqRecord]] = defaultdict(list)
    ids_to_plasmids: Dict[str, List[SeqRecord]] = defaultdict(list)
    for cycle in cycles:
        # filter for the minimum number of SeqRecords
        if min_count > 0 and len(cycle) < min_count:
            continue
        combinations = CombinatorialBins()
        for i, overhang in enumerate(cycle):
            next_overhang = cycle[(i + 1) % len(cycle)]
            # all fragments that connect this overhang to the next one
            record_bin = []
            for out_edge in graph.out_edges(keys=True):
                src, dest, index = out_edge
                if src != overhang or dest != next_overhang:
                    continue
                record_bin.append(graph.edges[src, dest, index]["frag"])
            combinations.append(record_bin)
        for fragments in combinations:
            # create the composite plasmid
            plasmid = SeqRecord(Seq("", IUPACUnambiguousDNA()))
            for fragment in fragments:
                plasmid += fragment.upper()
            # make sure it's not just a re-ligation of insert + backbone
            plasmid_seq = str(plasmid.seq)
            if any(plasmid_seq in seq for seq in seen_seqs):
                continue
            # filter for plasmids that have an 'include' feature
            if not _has_features(plasmid, include):
                continue
            # re-order the fragments to try and match the input order
            fragments = _reorder_fragments(record_set, fragments)
            # remember this plasmid (doubled, both orientations) so that the
            # same circle is not reported again
            seen_seqs.add(str(plasmid.seq + plasmid.seq))
            seen_seqs.add(str(
                (plasmid.seq + plasmid.seq).reverse_complement()))
            # make a unique id for the fragments
            fragments_id = _hash_fragments(fragments)
            ids_to_fragments[fragments_id] = fragments
            ids_to_plasmids[fragments_id].append(plasmid)
    # Group the plasmids by the fragment set they were built from.
    plasmids_and_fragments: List[Tuple[List[SeqRecord], List[SeqRecord]]] = []
    for ids, fragments in ids_to_fragments.items():
        plasmids = ids_to_plasmids[ids]
        for i, plasmid in enumerate(plasmids):
            plasmid.id = "+".join(f.id for f in fragments
                                  if f.id != "<unknown id>")
            plasmid.description = f"cloned from {', '.join(str(e) for e in enzymes)}"
            # several plasmids from the same fragments get a numeric suffix
            if len(plasmids) > 1:
                plasmid.id += f"({i + 1})"
        plasmids_and_fragments.append((plasmids, fragments))
    return plasmids_and_fragments
def _parse_row(row: str) -> Optional[SeqRecord]:
    """Turn one 'row' element of the iGEM XML into a SeqRecord.

    The fields are pulled out with the regular expressions in ``RES``
    instead of an XML parser, because characters in the XML's sha1 broke
    the XML parsers that were tried. This is known to be slow.

    Args:
        row: A single 'row' element in the XML

    Returns:
        A SeqRecord with the part id, sequence, and description stored, or
        None when the row has no sequence or one longer than 10,000 bp
    """

    def match(regex):
        found = regex.search(row)
        return found[1] if found else ""

    (name, seq, desc_short, desc_long,
     cats, cache, ftype, nickname) = [match(r) for r in RES]

    if not seq or len(seq) > 10_000:
        return None

    # characters stripped from every comma-separated feature field
    junk = str.maketrans("", "", "['(")

    features: List[SeqFeature] = []
    feature_matches = RE_FEATURES.search(cache)
    if feature_matches:
        for feature in feature_matches[1].split("]"):
            feature = feature.replace(", [", "")
            if feature.count(",") < 4:
                continue

            f_type, f_start, f_end, f_name, f_strand, *_ = [
                field.translate(junk).strip() for field in feature.split(",")
            ]
            f_start_int = int(f_start)
            f_end_int = int(f_end)

            # skip empty, inverted and too-short features
            if (f_start_int >= f_end_int
                    or f_end_int - f_start_int < DNA_WORD_SIZE):
                continue

            strand = 1 if f_strand == "0" else -1
            # have to -1 here. more CDS are of a length % 3 == 0 w/ this
            # I don't think this was enforced on iGEM teams when making features
            location = FeatureLocation(f_start_int - 1, f_end_int, strand)
            features.append(
                SeqFeature(
                    id=f_name,
                    location=location,
                    type=_get_type(f_name, f_type.lower()),
                    strand=strand,
                ))

    annotations = {
        "short_desc": desc_short,
        "description": desc_long,
        "categories": cats,
        "nickname": nickname,
        "type": _get_type(name, ftype),
    }
    return SeqRecord(
        Seq(seq, IUPACUnambiguousDNA()),
        id=name,
        dbxrefs=[name],
        annotations=annotations,
        features=features,
    )
def fetch_ensembl_transcript(ensembl_transcript_id: str,
                             is_retry: bool = False) -> SeqRecord:
    """Fetch the requested Ensembl transcript.

    Get the requested Ensembl transcript, together with exon and
    coding region (CDS) boundaries.

    Parameters
    ----------
    ensembl_transcript_id : str
      the ensembl transcript id, of the form ENST... Anything that does not
      start with 'ENS' is treated as a gene symbol and resolved first.
    is_retry : bool
      internal flag: set on the single automatic retry that follows a
      rate-limit error of the overlap query, so that it is not retried again.

    Returns
    -------
    `Bio.SeqRecord`

      The requested transcript sequence, in 5' -> 3' order, together
      with exon and CDS features. The coordinates of exons and CDS
      features are relative to the sequence fragment.

    Raises
    ------
    ValueError
      if a REST query returns an HTTP error, or if a response cannot be
      parsed as expected.

    >>> fetch_ensembl_transcript('ENST00000398844').description
    'chromosome:GRCh38:5:134648789:134727823:1'

    >>> fetch_ensembl_transcript('ATL3').description
    'Reverse complement of chromosome:GRCh38:11:63624087:63671612:-1'

    >>> fs = fetch_ensembl_transcript('ENST00000398844').features
    >>> len([f for f in fs if f.type == 'exon'])
    23
    """
    base_url = "http://rest.ensembl.org"

    if not ensembl_transcript_id.startswith('ENS'):
        # could be a gene symbol
        ensembl_transcript_id = _gene_to_enst(ensembl_transcript_id)

    # First, fetch the transcript sequence
    url = base_url + f"/sequence/id/{ensembl_transcript_id}"

    log.debug(f"Querying Ensembl for sequence of {ensembl_transcript_id}")
    response = _cached_session.get(url,
                                   params={
                                       "type": "genomic",
                                       "content-type": "application/json"
                                   })
    log.debug('Request cached: {}'.format(
        getattr(response, 'from_cache', False)))

    try:
        response.raise_for_status()
    except requests.exceptions.HTTPError:
        log.error("Ensembl sequence REST query returned error "
                  "{}".format(response.text))
        raise ValueError(response.text)

    response_data = response.json()

    try:
        # desc looks like "chromosome:GRCh38:5:134648789:134727823:1"
        description = response_data['desc'].split(':')
        species = description[1]
        chromosome_number = description[2]  # may be X
        sequence_left = int(description[3])
        sequence_right = int(description[4])
        transcript_strand = int(description[5])

        if sequence_left > sequence_right:
            raise ValueError(
                f"Expected left sequence boundary {sequence_left} "
                f"<= right sequence boundary {sequence_right}: did "
                "the format of the Ensembl REST response change?")

        sequence_id = response_data['id']

        seq_str = response_data['seq']

        log.debug(f"Retrieved sequence {response_data['desc']} of length "
                  f"{sequence_right - sequence_left} for species {species} on "
                  f"strand {transcript_strand}")
    except (KeyError, ValueError, IndexError) as e:
        # IndexError: 'desc' had fewer ':'-separated fields than expected
        log.error(e)
        log.error(
            'Error parsing sequence metadata from Ensembl REST response - '
            'did the format of the response change?')
        raise ValueError(e) from e

    if transcript_strand == -1:
        # If the transcript strand is -1, the sequence returned by
        # Ensembl is on the strand opposite the reference strand,
        # which is the strand of the Ensembl coordinates for
        # exons/coding regions. In this case, we initially store the
        # reverse complement of the sequence, and after fetching the
        # exon/coding regions, we'll return the reverse complement of
        # the `Bio.SeqRecord` object, which will properly re-index the
        # exon/coding regions.
        seq = Seq(seq_str, IUPACUnambiguousDNA()).reverse_complement()
    else:
        seq = Seq(seq_str, IUPACUnambiguousDNA())

    record = SeqRecord(seq, id=sequence_id, description=":".join(description))

    # Second, fetch the exon / CDS features overlapping the transcript
    url = base_url + f"/overlap/id/{ensembl_transcript_id}"

    log.debug(f"Querying Ensembl for overlaps of {ensembl_transcript_id}")
    response = _cached_session.get(url,
                                   params={
                                       "feature": ["cds", "exon"],
                                       "content-type": "application/json"
                                   })
    try:
        response.raise_for_status()
    except requests.exceptions.HTTPError:
        log.error("Ensembl sequence REST query returned error "
                  "{}".format(response.text))
        # You have exceeded the limit of 15 requests per second; please reduce your concurrent connections
        if not is_retry and \
                response.text.strip().startswith('You have exceeded the limit of'):
            log.warning(
                'Waiting 4 seconds then retrying fetch_ensembl_transcript')
            time.sleep(4)
            return fetch_ensembl_transcript(ensembl_transcript_id,
                                            is_retry=True)
        raise ValueError(response.text)

    response_data = response.json()

    try:
        # Handle the unlikely event of a single piece of information
        # overlapping a lonely transcript. A JSON object decodes to a dict,
        # which is itself iterable (over its keys), so it has to be detected
        # by type rather than by the presence of __iter__.
        if isinstance(response_data, dict) or \
                not hasattr(response_data, '__iter__'):
            response_data = [response_data]

        for response_datum in response_data:
            # keep only features of this transcript on the same assembly
            if response_datum['Parent'] != ensembl_transcript_id:
                continue

            if response_datum['assembly_name'] != species:
                continue

            # We store feature locations 0-indexed from the left-most
            # sequence boundary
            record.features.append(
                SeqFeature(location=FeatureLocation(
                    int(response_datum['start']) - sequence_left,
                    int(response_datum['end']) - sequence_left + 1,
                    strand=int(response_datum['strand'])),
                           type=response_datum['feature_type']))
        num_exon_boundaries = len(
            [f for f in record.features if f.type == 'exon'])

        num_cds_boundaries = len(
            [f for f in record.features if f.type == 'cds'])

        log.debug(f"Retrieved {num_exon_boundaries} exons and "
                  f"{num_cds_boundaries} coding regions for transcript "
                  f"{ensembl_transcript_id}")
    except (KeyError, ValueError) as e:
        log.error(e)
        log.error(
            'Error parsing overlap metadata from Ensembl REST response - '
            'did the format of the response change?')
        raise ValueError(e) from e

    if transcript_strand == -1:
        # By default `reverse_complement` doesn't preserve
        # description, so force it...
        record = record.reverse_complement(description=True)

        # ...but update the description to make clear the sequence
        # we're storing is the reverse complement of the sequence
        # described by the metadata in the description
        record.description = "Reverse complement of " + record.description

    # Keep the reference coordinates so that surrounding sequence can be
    # looked up later.
    record.annotations['reference_species'] = species
    record.annotations['reference_chromosome_number'] = chromosome_number
    record.annotations['reference_left_index'] = sequence_left
    record.annotations['reference_right_index'] = sequence_right
    record.annotations['transcript_strand'] = transcript_strand

    # Finally, sort features by their start locations
    record.features.sort(key=lambda f: f.location.start)

    return record
Created on Wed Oct 24 17:30:42 2018 @author: xies """ import numpy as np import matplotlib.pylab as plt import pandas as pd from Bio import SeqIO, motifs, SeqRecord from Bio.Alphabet.IUPAC import IUPACUnambiguousDNA from timeit import default_timer filename = '/data/crispri_hamming/nanog/nanog_chip_peaks.fa' peaks = [rec for rec in SeqIO.parse(filename, 'fasta')] for rec in peaks: rec.seq.alphabet = IUPACUnambiguousDNA() Npeaks = len(peaks) has_guide = np.zeros(Npeaks, dtype=bool) Lpeak = len(peaks[0]) # Load NANOG PWM and rewrite into JASPAR format filename = '/data/crispri_hamming/nanog/nanog_GSE11724.jaspar' with open(filename) as fh: m = [m for m in motifs.parse(fh, "jaspar")] pwm = m[0] pssm = pwm.pssm Lmotif = len(pwm) # Assuming 25-25-25-25 background, find min and max scores and 80% threshold min_score = pssm.min