def get_dedup_seqs(fa_fn, dedup_by_id=True):
    """Yield the records of a FASTA file, skipping duplicates.

    Args:
        fa_fn: FASTA source handed unchanged to ``SeqIO.parse``.
        dedup_by_id: When true (the default) two records are duplicates if
            their ``id`` strings are equal.  Otherwise they are duplicates
            if ``seguid`` returns the same checksum for their sequences.

    Yields:
        Every record whose key has not been seen before, in input order.

    This is a generator, so nothing runs until iteration starts.  Once the
    input is exhausted the number of skipped records is printed.
    """
    records = SeqIO.parse(fa_fn, "fasta")
    if dedup_by_id:
        print("Checking sequence ids for duplication")
    else:
        print("Checking sequence for duplication")

    dup_n = 0
    seen_keys = set()
    for record in records:
        if dedup_by_id:
            # Compare ids verbatim.  seguid() is a sequence checksum and
            # (per the Biopython documentation) upper-cases its input, so
            # hashing ids would merge ids that differ only in letter case.
            key = record.id
        else:
            key = seguid(record.seq)
        if key in seen_keys:
            dup_n += 1
            continue
        seen_keys.add(key)
        yield record
    print(str(dup_n) + " sequences are duplicates.")
def test_crc_checksum_collision(self):
    # Explicit testing of crc64 collision: the two light chain strings
    # differ, crc64 is expected to give the same value for both, and every
    # other checksum is expected to tell them apart.
    first = self.str_light_chain_one
    second = self.str_light_chain_two
    self.assertNotEqual(first, second)
    self.assertNotEqual(crc32(first), crc32(second))
    self.assertEqual(crc64(first), crc64(second))
    self.assertNotEqual(gcg(first), gcg(second))
    self.assertNotEqual(seguid(first), seguid(second))
def test_uniprot_swiss(self):
    """Bio.TogoWS.entry("uniprot", ["A1AG1_HUMAN","A1AG1_MOUSE"])

    Fetches both entries in one request, parses them as "swiss" and checks
    the id, name, length and seguid checksum of each record.
    """
    # Returns "swiss" format:
    entry_names = ["A1AG1_HUMAN", "A1AG1_MOUSE"]
    handle = TogoWS.entry("uniprot", entry_names)
    record1, record2 = SeqIO.parse(handle, "swiss")
    handle.close()

    # (record, expected id, expected name, expected length, expected seguid)
    expectations = (
        (record1, "P02763", "A1AG1_HUMAN", 201, "LHDJJ6oC7gUXo8CC7Xn6EUeA8Gk"),
        (record2, "Q60590", "A1AG1_MOUSE", 207, "FGcj+RFQhP2gRusCmwPFty5PJT0"),
    )
    for record, accession, name, length, checksum in expectations:
        self.assertEqual(record.id, accession)
        self.assertEqual(record.name, name)
        self.assertEqual(len(record), length)
        self.assertEqual(seguid(record.seq), checksum)
def remove_dup_seqs(records):
    """Yield each record whose sequence checksum has not been seen yet.

    The checksum is ``seguid(record.seq)``; records repeating an earlier
    checksum are silently skipped.  Input order is preserved.
    """
    seen = set()
    for rec in records:
        digest = seguid(rec.seq)
        if digest not in seen:
            seen.add(digest)
            yield rec
def checksum_summary(record):
    """Return an abbreviated string showing sequence, checksum, and length.

    Sequences shorter than 25 letters are shown in full.  Longer ones are
    shortened to their first 19 letters, ``...`` and their last 3 letters.
    The result has the form ``"<sequence> [<seguid>] len <length>"``.
    """
    # str() replaces the obsolete Seq.tostring() method, and the conversion
    # is done once instead of once per slice.
    sequence = str(record.seq)
    if len(record.seq) < 25:
        short = sequence
    else:
        short = sequence[:19] + "..." + sequence[-3:]
    return "%s [%s] len %i" % (short, seguid(record.seq), len(record.seq))
def remove_dup_seqs(records):
    """SeqRecord iterator that removes duplicate sequences.

    A record is a duplicate when ``seguid(record.seq)`` equals the checksum
    of a record yielded earlier; only the first occurrence is yielded.
    """
    yielded_checksums = set()
    for candidate in records:
        fingerprint = seguid(candidate.seq)
        is_new = fingerprint not in yielded_checksums
        if is_new:
            yielded_checksums.add(fingerprint)
            yield candidate
def remove_duplicates(fasta):
    """Yield the records of ``fasta`` whose sequence checksum is new.

    Records whose ``seguid(record.seq)`` was already seen are skipped and
    reported through ``logging.info``.
    """
    seen = set()
    for entry in fasta:
        digest = seguid(entry.seq)
        if digest not in seen:
            seen.add(digest)
            yield entry
        else:
            logging.info("Ignoring record {0}".format(entry.id))
def test_get_sprot_raw(self):
    """Bio.ExPASy.get_sprot_raw("O23729")

    Downloads the raw entry, parses it as "swiss" and checks its id, length
    and seguid checksum.
    """
    identifier = "O23729"
    expected_length = 394
    expected_checksum = "5Y08l+HJRDIlhLKzFEfkcKd1dkM"

    handle = ExPASy.get_sprot_raw(identifier)
    record = SeqIO.read(handle, "swiss")
    handle.close()

    self.assertEqual(record.id, identifier)
    self.assertEqual(len(record), expected_length)
    self.assertEqual(seguid(record.seq), expected_checksum)
def get_node_lists(chains):
    """Group the keys of ``chains`` by the seguid checksum of their values.

    Returns a list of lists: each inner list holds the names whose values
    produced the same ``seguid`` checksum, in the order they were visited.
    """
    names_by_checksum = {}
    for chain_name in chains.keys():
        checksum = seguid(chains.get(chain_name))
        names_by_checksum.setdefault(checksum, []).append(chain_name)
    return list(names_by_checksum.values())
def checksum_summary(record):
    """Abbreviated string showing sequence, checksum, and length.

    ``UnknownSeq`` sequences are reported via their ``repr``.  Otherwise the
    sequence is shown in full when shorter than 25 letters, or as its first
    19 letters, ``...`` and its last 3 letters.
    """
    seq = record.seq
    if isinstance(seq, UnknownSeq):
        return repr(seq)

    text = str(seq)
    length = len(seq)
    short = text if length < 25 else text[:19] + "..." + text[-3:]
    return "%s [%s] len %i" % (short, seguid(seq), length)
def test_ddbj_fasta(self):
    """Bio.TogoWS.entry("ddbj", "X52960", "fasta")

    Checks that the accession appears in the record id and name, and that
    the length and seguid checksum match the expected values.
    """
    accession = "X52960"
    handle = TogoWS.entry("ddbj", accession, "fasta")
    record = SeqIO.read(handle, "fasta")
    handle.close()

    self.assertTrue(accession in record.id, record.id)
    self.assertTrue(accession in record.name, record.name)
    self.assertEqual(len(record), 248)
    self.assertEqual(seguid(record.seq), "Ktxz0HgMlhQmrKTuZpOxPZJ6zGU")
def test_ddbj_genbank(self):
    """Bio.TogoWS.entry("ddbj", "X52960")

    Checks the id, name, length and seguid checksum of the parsed record.
    """
    accession = "X52960"
    handle = TogoWS.entry("ddbj", accession)  # Returns "genbank" format
    record = SeqIO.read(handle, "gb")
    handle.close()

    self.assertEqual(record.id, "X52960.1")
    self.assertEqual(record.name, accession)
    self.assertEqual(len(record), 248)
    self.assertEqual(seguid(record.seq), "Ktxz0HgMlhQmrKTuZpOxPZJ6zGU")
def test_nucleotide_fasta(self):
    """Bio.TogoWS.entry("nucleotide", "6273291", "fasta")

    Checks that the identifier appears in the record id and name, and that
    the length and seguid checksum match the expected values.
    """
    identifier = "6273291"
    handle = TogoWS.entry("nucleotide", identifier, "fasta")
    record = SeqIO.read(handle, "fasta")
    handle.close()

    self.assertTrue(identifier in record.id, record.id)
    self.assertTrue(identifier in record.name, record.name)
    self.assertEqual(len(record), 902)
    self.assertEqual(seguid(record.seq), "bLhlq4mEFJOoS9PieOx4nhGnjAQ")
def test_embl_AM905444(self):
    """Bio.TogoWS.entry("embl", "AM905444")

    Checks that the accession appears in the record id and name, that the
    description mentions "porin", and that the length and seguid checksum
    match the expected values.
    """
    accession = "AM905444"
    handle = TogoWS.entry("embl", accession)
    record = SeqIO.read(handle, "embl")
    handle.close()

    self.assertTrue(accession in record.id, record.id)
    self.assertTrue(accession in record.name, record.name)
    self.assertTrue("porin" in record.description, record.description)
    self.assertEqual(len(record), 1164)
    self.assertEqual(seguid(record.seq), "G0HtLpwF7i4FXUaUjDUPTjok79c")
def remove_duplicates(fasta):
    """Yield a fresh SeqRecord for every sequence not seen before.

    Records whose ``seguid(record.seq)`` repeats an earlier checksum are
    skipped and reported with ``print``.  For the others a new
    ``SeqIO.SeqRecord`` is built from the record's id, description, seq and
    name.
    """
    seen = set()
    for entry in fasta:
        digest = seguid(entry.seq)
        if digest not in seen:
            seen.add(digest)
            yield SeqIO.SeqRecord(
                id=entry.id,
                description=entry.description,
                seq=entry.seq,
                name=entry.name,
            )
        else:
            print("Ignoring record {0}".format(entry.id))
def remove_dup_seqs(records):
    """SeqRecord iterator that removes duplicate sequences.

    Duplicates are detected through ``seguid(record.seq)``.  Each skipped
    record is noted with a line written to ``LOG_OUT``.
    """
    seen = set()
    for rec in records:
        digest = seguid(rec.seq)
        if digest not in seen:
            seen.add(digest)
            yield rec
        else:
            LOG_OUT.write("Ignoring %s \n" % rec.id)
def unique_seqs(sequences):
    """Return a list of SeqRecord objects with redundant sequences removed.

    Two records are redundant when ``seguid`` gives the same checksum for
    their ``seq``; only the first one is kept.  Input order is preserved.
    """
    unique_records = []
    # A set gives constant-time membership tests; the previous list made
    # the whole function quadratic in the number of distinct sequences.
    seen_checksums = set()
    for record in sequences:
        checksum = seguid(record.seq)
        if checksum not in seen_checksums:
            seen_checksums.add(checksum)
            unique_records.append(record)
    return unique_records
def checksum_summary(record):
    """Return an abbreviated string showing sequence, checksum, and length.

    ``UnknownSeq`` sequences are reported via their ``repr``.  Otherwise the
    sequence is shown in full when shorter than 25 letters, or as its first
    19 letters, ``...`` and its last 3 letters, followed by the seguid
    checksum and the length.
    """
    if isinstance(record.seq, UnknownSeq):
        return repr(record.seq)
    # str() replaces the obsolete Seq.tostring() method, and the conversion
    # is done once instead of once per slice.
    sequence = str(record.seq)
    if len(record.seq) < 25:
        short = sequence
    else:
        short = sequence[:19] + "..." + sequence[-3:]
    return "%s [%s] len %i" % (short, seguid(record.seq), len(record.seq))
def checksum_summary(record):
    """Abbreviated string showing sequence, checksum, and length.

    An ``UnknownSeq`` is summarised by its ``repr``.  Any other sequence is
    printed whole if it has fewer than 25 letters, else as 19 leading
    letters, ``...`` and 3 trailing letters.
    """
    sequence = record.seq
    if isinstance(sequence, UnknownSeq):
        return repr(sequence)

    letters = str(sequence)
    if len(sequence) >= 25:
        head, tail = letters[:19], letters[-3:]
        letters = head + "..." + tail
    return "%s [%s] len %i" % (letters, seguid(sequence), len(sequence))
def get_fastaseq(path):
    """Collect the unique FASTA records from the files found for ``path``.

    The files are those returned by ``get_filepaths(path, allowedTypes)``.
    Each is parsed as FASTA and a record is kept only if the ``seguid``
    checksum of its sequence has not been seen in this call.

    Returns:
        A list of the unique sequence records, in the order encountered.
    """
    seq_dict = []
    # A set gives constant-time duplicate detection; the previous list made
    # the scan quadratic in the number of unique records.
    seen_checksums = set()
    files = get_filepaths(path, allowedTypes)
    for fasta_file in files:
        print("")
        # NOTE(review): this prints the number of files, not the number of
        # records in this file -- confirm the intended message.
        print("Found", len(files), "records in", fasta_file)
        for seq_record in SeqIO.parse(fasta_file, "fasta"):
            # Compute the checksum once per record instead of twice.
            checksum = seguid(seq_record.seq)
            if checksum not in seen_checksums:
                seen_checksums.add(checksum)
                print(seq_record.id, "is added to the list")
                seq_dict.append(seq_record)
            else:
                print(seq_record.id, "is already in list")
    return seq_dict
def seq_checksums(self, seq_str, exp_crc32, exp_crc64, exp_gcg, exp_seguid,
                  exp_simple_LCC, exp_window_LCC):
    """Check all checksum and LCC functions against their expected values.

    The checks are repeated for the plain string and for ``Seq`` and
    ``MutableSeq`` objects built from it with ``single_letter_alphabet``.
    """
    variants = [
        seq_str,
        Seq(seq_str, single_letter_alphabet),
        MutableSeq(seq_str, single_letter_alphabet),
    ]
    # (expected value, function under test), in the order they are checked.
    checks = (
        (exp_crc32, u_crc32),
        (exp_crc64, crc64),
        (exp_gcg, gcg),
        (exp_seguid, seguid),
        (exp_simple_LCC, simple_LCC),
        (exp_window_LCC, windowed_LCC),
    )
    for s in variants:
        for expected, function in checks:
            self.assertEqual(expected, function(s))
def test_protein_fasta(self):
    """Bio.TogoWS.entry("protein", "16130152", "fasta")

    Checks that the identifier appears in the record id and name, that the
    description mentions "porin protein", and that the length and seguid
    checksum match the expected values.
    """
    identifier = "16130152"
    handle = TogoWS.entry("protein", identifier, "fasta")
    record = SeqIO.read(handle, "fasta")
    handle.close()

    # Could use assertIn but requires Python 2.7+
    self.assertTrue(identifier in record.id, record.id)
    self.assertTrue(identifier in record.name, record.name)
    self.assertTrue("porin protein" in record.description,
                    record.description)
    self.assertEqual(len(record), 367)
    self.assertEqual(seguid(record.seq), "fCjcjMFeGIrilHAn6h+yju267lg")
def simple(self, database, formats, entry, length, checksum):
    """Fetch ``entry`` from ``database`` in each format and verify it.

    For every format in ``formats`` the entry is downloaded with
    ``Entrez.efetch``, parsed with ``SeqIO.read`` and checked: ``entry``
    must appear in the record name or id (or equal its "gi" annotation),
    the record length must equal ``length`` and the seguid checksum of its
    sequence must equal ``checksum``.
    """
    for f in formats:
        handle = Entrez.efetch(db=database, id=entry, rettype=f)
        record = SeqIO.read(handle, f)
        handle.close()
        # assertTrue replaces the deprecated assert_ alias, which newer
        # unittest versions no longer provide.
        self.assertTrue((entry in record.name) or
                        (entry in record.id) or
                        ("gi" in record.annotations and
                         record.annotations["gi"] == entry),
                        "%s got %s, %s" % (entry, record.name, record.id))
        self.assertEqual(len(record), length)
        self.assertEqual(seguid(record.seq), checksum)
def test_nucleotide_fasta(self):
    """Bio.TogoWS.entry("nucleotide", "6273291", "fasta")

    Checks that the accession appears in the record id and name, and that
    the length and seguid checksum match the expected values.
    """
    handle = TogoWS.entry("nucleotide", "6273291", "fasta")
    record = SeqIO.read(handle, "fasta")
    handle.close()

    # NCBI is phasing out GI numbers, so no longer true:
    # self.assertIn("6273291", record.id)
    # self.assertIn("6273291", record.name)
    accession = "AF191665.1"
    self.assertIn(accession, record.id)
    self.assertIn(accession, record.name)
    self.assertEqual(len(record), 902)
    self.assertEqual(seguid(record.seq), "bLhlq4mEFJOoS9PieOx4nhGnjAQ")
def test_get_sprot_raw(self):
    """Bio.ExPASy.get_sprot_raw("O23729")

    Raises IOError if the response starts like an HTML page; otherwise
    parses the entry as "swiss" and checks its id, length and seguid
    checksum.
    """
    identifier = "O23729"
    # This is to catch an error page from our proxy:
    handle = UndoHandle(ExPASy.get_sprot_raw(identifier))
    first_line = _as_string(handle.peekline())
    if first_line.startswith("<!DOCTYPE HTML"):
        raise IOError
    record = SeqIO.read(handle, "swiss")
    handle.close()

    self.assertEqual(record.id, identifier)
    self.assertEqual(len(record), 394)
    self.assertEqual(seguid(record.seq), "5Y08l+HJRDIlhLKzFEfkcKd1dkM")
def test_protein_fasta(self):
    """Bio.TogoWS.entry("protein", "16130152", "fasta")

    Checks that the accession appears in the record id and name, that the
    description mentions "porin protein", and that the length and seguid
    checksum match the expected values.
    """
    handle = TogoWS.entry("protein", "16130152", "fasta")
    record = SeqIO.read(handle, "fasta")
    handle.close()

    # NCBI is phasing out GI numbers, so no longer true:
    # self.assertIn("16130152", record.id)
    # self.assertIn("16130152", record.name)
    accession = "NP_416719.1"
    self.assertIn(accession, record.id)
    self.assertIn(accession, record.name)
    self.assertIn("porin protein", record.description)
    self.assertEqual(len(record), 367)
    self.assertEqual(seguid(record.seq), "fCjcjMFeGIrilHAn6h+yju267lg")
def aln_undup(alignment):
    """Return a new alignment without records that repeat a sequence.

    Records are compared by ``seguid(record.seq)``.  The first record with
    a given checksum is appended to a fresh ``MultipleSeqAlignment``; later
    ones are skipped and reported with ``print``.
    """
    aln = MultipleSeqAlignment([])
    checksums = set()
    for record in alignment:
        checksum = seguid(record.seq)
        if checksum in checksums:
            # Function-call form: the bare print statement is a syntax
            # error on Python 3, while this prints the same text on both.
            print("Ignoring %s" % record.id)
            continue
        checksums.add(checksum)
        aln.append(record)
    return aln
def simple(self, database, formats, entry, length, checksum):
    """Fetch ``entry`` from ``database`` in each format and verify it.

    Each format is requested from ``Entrez.efetch`` in text mode.  The
    "gbwithparts" return type is parsed as "gb".  The record must mention
    ``entry`` in its name or id (or carry it as its "gi" annotation), have
    the given ``length`` and the given seguid ``checksum``.
    """
    for rettype in formats:
        handle = Entrez.efetch(db=database, id=entry, rettype=rettype,
                               retmode="text")
        parse_format = "gb" if rettype == "gbwithparts" else rettype
        record = SeqIO.read(handle, parse_format)
        handle.close()

        entry_found = (
            (entry in record.name)
            or (entry in record.id)
            or ("gi" in record.annotations
                and record.annotations["gi"] == entry)
        )
        self.assertTrue(entry_found,
                        "%s got %s, %s" % (entry, record.name, record.id))
        self.assertEqual(len(record), length)
        self.assertEqual(seguid(record.seq), checksum)
def test_get_sprot_raw(self):
    """Bio.ExPASy.get_sprot_raw("O23729")

    Any IOError while fetching or parsing (including the one raised when
    the response starts like an HTML page) is turned into a
    MissingExternalDependencyError.  Otherwise the record's id, length and
    seguid checksum are checked.
    """
    identifier = "O23729"
    try:
        # This is to catch an error page from our proxy:
        handle = UndoHandle(ExPASy.get_sprot_raw(identifier))
        first_line = _as_string(handle.peekline())
        if first_line.startswith("<!DOCTYPE HTML"):
            raise IOError
        record = SeqIO.read(handle, "swiss")
        handle.close()
    except IOError:
        raise MissingExternalDependencyError(
            "internet (or maybe just ExPASy) not available")

    self.assertEqual(record.id, identifier)
    self.assertEqual(len(record), 394)
    self.assertEqual(seguid(record.seq), "5Y08l+HJRDIlhLKzFEfkcKd1dkM")
def unique_ids(sequences):
    """Return a list of SeqRecord objects with redundant ids renamed.

    Every input record is returned, in input order.  A record whose id
    checksum (``seguid(seq.id)``) was already seen gets its id replaced in
    place by ``"<id>.<n>"``, where ``n`` counts the repeated ids found so
    far across the whole input, starting at 0.
    """
    unique_records = []
    # A set gives constant-time membership tests; the previous list made
    # the loop quadratic in the number of distinct ids.
    seen_checksums = set()
    redundant_id_count = 0
    for seq in sequences:
        # NOTE(review): seguid() is a sequence checksum; per the Biopython
        # docs it upper-cases its input, so ids differing only in case are
        # presumably treated as repeats -- confirm this is intended.
        checksum = seguid(seq.id)
        if checksum not in seen_checksums:
            seen_checksums.add(checksum)
        else:
            print("repeated id detected, adding '.{}' suffix".format(redundant_id_count))
            seq.id = "{}.{}".format(seq.id, redundant_id_count)
            redundant_id_count += 1
        unique_records.append(seq)
    return unique_records
def simple(self, database, formats, entry, length, checksum):
    """Fetch ``entry`` from ``database`` in each format and verify it.

    An IOError while fetching or parsing is turned into a
    MissingExternalDependencyError.  The record must mention ``entry`` in
    its name or id (or carry it as its "gi" annotation), have the given
    ``length`` and the given seguid ``checksum``.
    """
    for rettype in formats:
        try:
            handle = Entrez.efetch(db=database, id=entry, rettype=rettype)
            record = SeqIO.read(handle, rettype)
            handle.close()
        except IOError:
            raise MissingExternalDependencyError(
                "internet (or maybe just NCBI) not available")

        entry_found = (
            (entry in record.name)
            or (entry in record.id)
            or ("gi" in record.annotations
                and record.annotations["gi"] == entry)
        )
        self.assertTrue(entry_found,
                        "%s got %s, %s" % (entry, record.name, record.id))
        self.assertEqual(len(record), length)
        self.assertEqual(seguid(record.seq), checksum)