Example #1
0
def get_dedup_seqs(fa_fn, dedup_by_id=True):
    """Yield the records of a FASTA file, skipping duplicates.

    Args:
        fa_fn: FASTA file (passed straight to ``SeqIO.parse``).
        dedup_by_id: when true, two records are duplicates if the seguid
            of their ids match; otherwise the seguid of their sequences
            is compared.

    Yields:
        Each record whose key has not been seen earlier in the file.

    This is a generator: none of the messages are printed until iteration
    starts, and the duplicate count is printed only once the iterator has
    been exhausted.
    """
    # NOTE(review): Biopython's seguid presumably upper-cases its input, so
    # ids differing only by case may be treated as equal - confirm.
    parsed = SeqIO.parse(fa_fn, "fasta")

    if dedup_by_id:
        print("Checking sequence ids for duplication")
    else:
        print("Checking sequence for duplication")

    seen_keys = set()
    duplicate_count = 0
    for rec in parsed:
        key = seguid(rec.id if dedup_by_id else rec.seq)
        if key not in seen_keys:
            seen_keys.add(key)
            yield rec
        else:
            duplicate_count += 1

    print(str(duplicate_count) + " sequences are duplicates.")
Example #2
0
 def test_crc_checksum_collision(self):
     # Explicit testing of crc64 collision: the two light-chain strings are
     # expected to differ and to be told apart by crc32, gcg and seguid,
     # while crc64 is expected to give the same value for both.
     first = self.str_light_chain_one
     second = self.str_light_chain_two
     self.assertNotEqual(first, second)
     self.assertNotEqual(crc32(first), crc32(second))
     self.assertEqual(crc64(first), crc64(second))
     self.assertNotEqual(gcg(first), gcg(second))
     self.assertNotEqual(seguid(first), seguid(second))
Example #3
0
    def test_uniprot_swiss(self):
        """Bio.TogoWS.entry("uniprot", ["A1AG1_HUMAN","A1AG1_MOUSE"])

        Fetches both entries in one request, parses them as "swiss"
        records and checks id, name, length and seguid of each.
        """
        # Returns "swiss" format:
        handle = TogoWS.entry("uniprot", ["A1AG1_HUMAN", "A1AG1_MOUSE"])
        try:
            # Tuple unpacking fails unless exactly two records are parsed.
            record1, record2 = SeqIO.parse(handle, "swiss")
        finally:
            # Close the handle even when parsing or unpacking raises.
            handle.close()

        self.assertEqual(record1.id, "P02763")
        self.assertEqual(record1.name, "A1AG1_HUMAN")
        self.assertEqual(len(record1), 201)
        self.assertEqual(seguid(record1.seq), "LHDJJ6oC7gUXo8CC7Xn6EUeA8Gk")

        self.assertEqual(record2.id, "Q60590")
        self.assertEqual(record2.name, "A1AG1_MOUSE")
        self.assertEqual(len(record2), 207)
        self.assertEqual(seguid(record2.seq), "FGcj+RFQhP2gRusCmwPFty5PJT0")
Example #4
0
def remove_dup_seqs(records):
    """Yield each record whose sequence seguid has not been seen before.

    Records are yielded in input order; a record whose sequence checksum
    matches an earlier one is silently skipped.
    """
    seen = set()
    for rec in records:
        digest = seguid(rec.seq)
        if digest not in seen:
            seen.add(digest)
            yield rec
Example #5
0
def checksum_summary(record):
    """Return an abbreviated sequence, its seguid checksum and its length.

    Sequences shorter than 25 letters are shown in full; longer ones are
    shown as their first 19 letters, "...", and their last 3 letters.

    Args:
        record: object with a ``seq`` attribute (a SeqRecord).

    Returns:
        A string of the form ``"<sequence> [<seguid>] len <length>"``.
    """
    # str() is used instead of Seq.tostring(), which is no longer available
    # in current Biopython releases; the conversion is done only once.
    sequence = str(record.seq)
    length = len(record.seq)
    if length < 25:
        short = sequence
    else:
        short = sequence[:19] + "..." + sequence[-3:]
    return "%s [%s] len %i" % (short, seguid(record.seq), length)
Example #6
0
def remove_dup_seqs(records):
    """SeqRecord iterator that removes duplicate sequences.

    A record is passed through only if the seguid of its sequence has not
    been produced by an earlier record.
    """
    seen_checksums = set()
    for rec in records:
        key = seguid(rec.seq)
        if key not in seen_checksums:
            seen_checksums.add(key)
            yield rec
Example #7
0
def remove_duplicates(fasta):
    """Yield records from *fasta*, dropping repeated sequences.

    Two records count as duplicates when the seguid of their sequences is
    equal. Each dropped record is reported through ``logging.info``.
    """
    seen = set()
    for rec in fasta:
        digest = seguid(rec.seq)
        if digest not in seen:
            seen.add(digest)
            yield rec
        else:
            logging.info("Ignoring record {0}".format(rec.id))
Example #8
0
 def test_get_sprot_raw(self):
     """Bio.ExPASy.get_sprot_raw("O23729")

     Fetches the entry, parses it as a single "swiss" record and checks
     its id, length and seguid checksum.
     """
     identifier = "O23729"
     handle = ExPASy.get_sprot_raw(identifier)
     try:
         record = SeqIO.read(handle, "swiss")
     finally:
         # Release the handle even when parsing raises.
         handle.close()
     self.assertEqual(record.id, identifier)
     self.assertEqual(len(record), 394)
     self.assertEqual(seguid(record.seq), "5Y08l+HJRDIlhLKzFEfkcKd1dkM")
def get_node_lists(chains):
    """Group the keys of *chains* by the seguid of their values.

    Args:
        chains: mapping of name -> sequence.

    Returns:
        A list of lists of names; names share an inner list when the
        seguid of their values is equal.
    """
    groups = {}
    for name, sequence in chains.items():
        groups.setdefault(seguid(sequence), []).append(name)
    return list(groups.values())
def checksum_summary(record):
    """Return a short string with the sequence, its seguid and its length.

    An ``UnknownSeq`` is reported via its ``repr``. Otherwise sequences
    shorter than 25 letters are shown whole and longer ones as the first
    19 letters, "...", and the last 3 letters.
    """
    seq = record.seq
    if isinstance(seq, UnknownSeq):
        return repr(seq)
    length = len(seq)
    text = str(seq)
    if length >= 25:
        text = text[:19] + "..." + text[-3:]
    return "%s [%s] len %i" % (text, seguid(seq), length)
Example #11
0
 def test_ddbj_fasta(self):
     """Bio.TogoWS.entry("ddbj", "X52960", "fasta")

     Parses the reply as a single FASTA record and checks that the
     accession appears in id and name, plus length and seguid.
     """
     handle = TogoWS.entry("ddbj", "X52960", "fasta")
     try:
         record = SeqIO.read(handle, "fasta")
     finally:
         # Release the handle even when parsing raises.
         handle.close()
     self.assertTrue("X52960" in record.id, record.id)
     self.assertTrue("X52960" in record.name, record.name)
     self.assertEqual(len(record), 248)
     self.assertEqual(seguid(record.seq), "Ktxz0HgMlhQmrKTuZpOxPZJ6zGU")
Example #12
0
 def test_ddbj_genbank(self):
     """Bio.TogoWS.entry("ddbj", "X52960")

     Parses the reply as a single GenBank record and checks its id,
     name, length and seguid.
     """
     handle = TogoWS.entry("ddbj", "X52960")  # Returns "genbank" format
     try:
         record = SeqIO.read(handle, "gb")
     finally:
         # Release the handle even when parsing raises.
         handle.close()
     self.assertEqual(record.id, "X52960.1")
     self.assertEqual(record.name, "X52960")
     self.assertEqual(len(record), 248)
     self.assertEqual(seguid(record.seq), "Ktxz0HgMlhQmrKTuZpOxPZJ6zGU")
Example #13
0
 def test_nucleotide_fasta(self):
     """Bio.TogoWS.entry("nucleotide", "6273291", "fasta")

     Parses the reply as a single FASTA record and checks that the
     identifier appears in id and name, plus length and seguid.
     """
     handle = TogoWS.entry("nucleotide", "6273291", "fasta")
     try:
         record = SeqIO.read(handle, "fasta")
     finally:
         # Release the handle even when parsing raises.
         handle.close()
     self.assertTrue("6273291" in record.id, record.id)
     self.assertTrue("6273291" in record.name, record.name)
     self.assertEqual(len(record), 902)
     self.assertEqual(seguid(record.seq), "bLhlq4mEFJOoS9PieOx4nhGnjAQ")
Example #14
0
 def test_embl_AM905444(self):
     """Bio.TogoWS.entry("embl", "AM905444")

     Parses the reply as a single EMBL record and checks the accession
     in id and name, "porin" in the description, length and seguid.
     """
     handle = TogoWS.entry("embl", "AM905444")
     try:
         record = SeqIO.read(handle, "embl")
     finally:
         # Release the handle even when parsing raises.
         handle.close()
     self.assertTrue("AM905444" in record.id, record.id)
     self.assertTrue("AM905444" in record.name, record.name)
     self.assertTrue("porin" in record.description, record.description)
     self.assertEqual(len(record), 1164)
     self.assertEqual(seguid(record.seq), "G0HtLpwF7i4FXUaUjDUPTjok79c")
Example #15
0
def remove_duplicates(fasta):
    """Yield a fresh SeqRecord for every first occurrence of a sequence.

    Records whose sequence seguid was already seen are reported with
    ``print`` and skipped. Each yielded record is newly built from only
    the id, description, seq and name of the input record.
    """
    seen = set()
    for rec in fasta:
        digest = seguid(rec.seq)
        if digest not in seen:
            seen.add(digest)
            yield SeqIO.SeqRecord(id=rec.id, description=rec.description,
                                  seq=rec.seq, name=rec.name)
        else:
            print ("Ignoring record {0}".format(rec.id))
def remove_dup_seqs(records):
    """SeqRecord iterator that removes duplicate sequences.

    A record whose sequence seguid was already seen is not yielded; its
    id is written to ``LOG_OUT`` instead.
    """
    seen = set()
    for rec in records:
        digest = seguid(rec.seq)
        if digest in seen:
            LOG_OUT.write("Ignoring %s \n" % rec.id)
        else:
            seen.add(digest)
            yield rec
Example #17
0
def unique_seqs(sequences):
    """Return a list of SeqRecord objects with redundant sequences removed.

    Args:
        sequences: iterable of records exposing a ``seq`` attribute.

    Returns:
        The records, in input order, keeping only the first record for
        each distinct sequence seguid.
    """
    unique_records = []
    # A set gives constant-time membership tests; the previous list made
    # the whole pass quadratic in the number of distinct sequences.
    seen_checksums = set()
    for record in sequences:
        checksum = seguid(record.seq)
        if checksum not in seen_checksums:
            seen_checksums.add(checksum)
            unique_records.append(record)
    return unique_records
Example #18
0
def checksum_summary(record):
    """Return an abbreviated sequence, its seguid checksum and its length.

    An ``UnknownSeq`` is reported via its ``repr``. Otherwise sequences
    shorter than 25 letters are shown in full and longer ones as their
    first 19 letters, "...", and their last 3 letters.

    Args:
        record: object with a ``seq`` attribute (a SeqRecord).

    Returns:
        A string of the form ``"<sequence> [<seguid>] len <length>"``,
        or the ``repr`` of the sequence for an ``UnknownSeq``.
    """
    if isinstance(record.seq, UnknownSeq):
        return repr(record.seq)
    # str() is used instead of Seq.tostring(), which is no longer available
    # in current Biopython releases; the conversion is done only once.
    sequence = str(record.seq)
    length = len(record.seq)
    if length < 25:
        short = sequence
    else:
        short = sequence[:19] + "..." + sequence[-3:]
    return "%s [%s] len %i" % (short, seguid(record.seq), length)
Example #19
0
def checksum_summary(record):
    """Summarise a record as "<sequence> [<seguid>] len <length>".

    For an ``UnknownSeq`` the ``repr`` of the sequence is returned
    instead. Sequences of 25 letters or more are abbreviated to the first
    19 letters, "...", and the last 3 letters.
    """
    if isinstance(record.seq, UnknownSeq):
        return repr(record.seq)
    full = str(record.seq)
    short = full if len(record.seq) < 25 else full[:19] + "..." + full[-3:]
    return "%s [%s] len %i" % (short, seguid(record.seq), len(record.seq))
Example #20
0
def get_fastaseq(path):
    """
    Collect the unique sequence records from the FASTA files found for *path*.

    The files come from ``get_filepaths(path, allowedTypes)``
    (presumably a recursive search - confirm against that helper). Every
    file is parsed as FASTA, and a record is kept only if the seguid of
    its sequence has not been seen in this or an earlier file. Progress
    is reported with ``print``.

    Returns:
        A list of the unique records, in the order they were first met.
    """
    unique_records = []
    # A set gives constant-time duplicate checks (the list used before made
    # the scan quadratic).
    seen_checksums = set()

    files = get_filepaths(path, allowedTypes)
    for fasta_file in files:
        print("")
        # Materialise the records so the message reports the number of
        # records in this file; it used to print the number of files.
        records = list(SeqIO.parse(fasta_file, "fasta"))
        print("Found", len(records), "records in", fasta_file)
        for seq_record in records:
            checksum = seguid(seq_record.seq)
            if checksum not in seen_checksums:
                seen_checksums.add(checksum)
                print(seq_record.id, "is added to the list")
                unique_records.append(seq_record)
            else:
                print(seq_record.id, "is already in list")
    return unique_records
Example #21
0
 def seq_checksums(self, seq_str, exp_crc32, exp_crc64, exp_gcg, exp_seguid,
                   exp_simple_LCC, exp_window_LCC):
     """Check every checksum and LCC value for three forms of *seq_str*.

     The plain string, a ``Seq`` and a ``MutableSeq`` built from it are
     each expected to give the supplied crc32, crc64, gcg, seguid,
     simple LCC and windowed LCC values.
     """
     variants = (
         seq_str,
         Seq(seq_str, single_letter_alphabet),
         MutableSeq(seq_str, single_letter_alphabet),
     )
     for candidate in variants:
         self.assertEqual(exp_crc32, u_crc32(candidate))
         self.assertEqual(exp_crc64, crc64(candidate))
         self.assertEqual(exp_gcg, gcg(candidate))
         self.assertEqual(exp_seguid, seguid(candidate))
         self.assertEqual(exp_simple_LCC, simple_LCC(candidate))
         self.assertEqual(exp_window_LCC, windowed_LCC(candidate))
Example #22
0
 def test_protein_fasta(self):
     """Bio.TogoWS.entry("protein", "16130152", "fasta")

     Parses the reply as a single FASTA record and checks the identifier
     in id and name, the description, the length and the seguid.
     """
     handle = TogoWS.entry("protein", "16130152", "fasta")
     try:
         record = SeqIO.read(handle, "fasta")
     finally:
         # Release the handle even when parsing raises.
         handle.close()
     # Could use assertIn but requires Python 2.7+
     self.assertTrue("16130152" in record.id, record.id)
     self.assertTrue("16130152" in record.name, record.name)
     self.assertTrue("porin protein" in record.description, record.description)
     self.assertEqual(len(record), 367)
     self.assertEqual(seguid(record.seq), "fCjcjMFeGIrilHAn6h+yju267lg")
Example #23
0
 def simple(self, database, formats, entry, length, checksum):
     """Fetch *entry* from *database* in every format and verify it.

     For each format in *formats* the entry is fetched with
     ``Entrez.efetch``, parsed as a single record of that format, and
     checked: *entry* must appear in the record name or id (or equal
     the "gi" annotation), and the length and seguid must match
     *length* and *checksum*.
     """
     for f in formats:
         handle = Entrez.efetch(db=database, id=entry, rettype=f)
         try:
             record = SeqIO.read(handle, f)
         finally:
             # Release the handle even when parsing raises.
             handle.close()
         # assertTrue replaces the deprecated assert_ alias.
         self.assertTrue((entry in record.name)
                         or (entry in record.id)
                         or ("gi" in record.annotations
                             and record.annotations["gi"] == entry),
                         "%s got %s, %s" % (entry, record.name, record.id))
         self.assertEqual(len(record), length)
         self.assertEqual(seguid(record.seq), checksum)
Example #24
0
 def test_nucleotide_fasta(self):
     """Bio.TogoWS.entry("nucleotide", "6273291", "fasta")

     Parses the reply as a single FASTA record and checks the accession
     in id and name, plus length and seguid.
     """
     handle = TogoWS.entry("nucleotide", "6273291", "fasta")
     try:
         record = SeqIO.read(handle, "fasta")
     finally:
         # Release the handle even when parsing raises.
         handle.close()
     # NCBI is phasing out GI numbers, so no longer true:
     # self.assertIn("6273291", record.id)
     # self.assertIn("6273291", record.name)
     self.assertIn("AF191665.1", record.id)
     self.assertIn("AF191665.1", record.name)
     self.assertEqual(len(record), 902)
     self.assertEqual(seguid(record.seq), "bLhlq4mEFJOoS9PieOx4nhGnjAQ")
Example #25
0
 def test_get_sprot_raw(self):
     """Bio.ExPASy.get_sprot_raw("O23729")

     Raises IOError if the first line looks like an HTML page instead
     of a Swiss-Prot entry; otherwise parses one "swiss" record and
     checks its id, length and seguid.
     """
     identifier = "O23729"
     # This is to catch an error page from our proxy:
     handle = UndoHandle(ExPASy.get_sprot_raw(identifier))
     try:
         if _as_string(handle.peekline()).startswith("<!DOCTYPE HTML"):
             raise IOError
         record = SeqIO.read(handle, "swiss")
     finally:
         # Release the handle on the error-page path and on parse
         # failures as well as on success.
         handle.close()
     self.assertEqual(record.id, identifier)
     self.assertEqual(len(record), 394)
     self.assertEqual(seguid(record.seq), "5Y08l+HJRDIlhLKzFEfkcKd1dkM")
Example #26
0
 def test_protein_fasta(self):
     """Bio.TogoWS.entry("protein", "16130152", "fasta")

     Parses the reply as a single FASTA record and checks the accession
     in id and name, the description, the length and the seguid.
     """
     handle = TogoWS.entry("protein", "16130152", "fasta")
     try:
         record = SeqIO.read(handle, "fasta")
     finally:
         # Release the handle even when parsing raises.
         handle.close()
     # NCBI is phasing out GI numbers, so no longer true:
     # self.assertIn("16130152", record.id)
     # self.assertIn("16130152", record.name)
     self.assertIn("NP_416719.1", record.id)
     self.assertIn("NP_416719.1", record.name)
     self.assertIn("porin protein", record.description)
     self.assertEqual(len(record), 367)
     self.assertEqual(seguid(record.seq), "fCjcjMFeGIrilHAn6h+yju267lg")
Example #27
0
def aln_undup(alignment):
    """Return a new alignment without duplicate sequences.

    Args:
        alignment: iterable of records exposing ``seq`` and ``id``.

    Returns:
        A ``MultipleSeqAlignment`` holding, in input order, the first
        record for each distinct sequence seguid. The id of every
        skipped record is printed.
    """
    aln = MultipleSeqAlignment([])
    checksums = set()
    for record in alignment:
        checksum = seguid(record.seq)
        if checksum in checksums:
            # Parenthesised single-argument form prints the same text on
            # Python 2 and is valid syntax on Python 3.
            print("Ignoring %s" % record.id)
            continue
        checksums.add(checksum)
        aln.append(record)

    return aln
Example #28
0
 def simple(self, database, formats, entry, length, checksum):
     """Fetch *entry* from *database* in every format and verify it.

     For each format in *formats* the entry is fetched as text with
     ``Entrez.efetch`` and parsed as a single record ("gbwithparts"
     replies are parsed as "gb"). *entry* must appear in the record
     name or id (or equal the "gi" annotation), and the length and
     seguid must match *length* and *checksum*.
     """
     for f in formats:
         handle = Entrez.efetch(db=database, id=entry, rettype=f, retmode="text")
         # Keep the loop variable intact; only the parser format differs.
         parse_format = "gb" if f == "gbwithparts" else f
         try:
             record = SeqIO.read(handle, parse_format)
         finally:
             # Release the handle even when parsing raises.
             handle.close()
         self.assertTrue((entry in record.name)
                         or (entry in record.id)
                         or ("gi" in record.annotations
                             and record.annotations["gi"] == entry),
                         "%s got %s, %s" % (entry, record.name, record.id))
         self.assertEqual(len(record), length)
         self.assertEqual(seguid(record.seq), checksum)
 def test_get_sprot_raw(self):
     """Bio.ExPASy.get_sprot_raw("O23729")

     Any IOError while fetching or parsing (including the one raised
     here for an HTML error page) is turned into
     MissingExternalDependencyError. Otherwise the record's id, length
     and seguid are checked.
     """
     identifier = "O23729"
     try:
         #This is to catch an error page from our proxy:
         handle = UndoHandle(ExPASy.get_sprot_raw(identifier))
         try:
             if _as_string(handle.peekline()).startswith("<!DOCTYPE HTML"):
                 raise IOError
             record = SeqIO.read(handle, "swiss")
         finally:
             # Release the handle on the error-page path and on parse
             # failures as well as on success.
             handle.close()
     except IOError:
         raise MissingExternalDependencyError(
               "internet (or maybe just ExPASy) not available")
     self.assertEqual(record.id, identifier)
     self.assertEqual(len(record), 394)
     self.assertEqual(seguid(record.seq), "5Y08l+HJRDIlhLKzFEfkcKd1dkM")
Example #30
0
def unique_ids(sequences):
    """Return a list of SeqRecord objects with redundant ids renamed.

    Every input record is returned, in order. A record whose id (compared
    through its seguid) was already used gets a ``.<n>`` suffix appended
    to its id in place, where ``n`` is a counter shared by all renames.

    Args:
        sequences: iterable of records with a writable ``id`` attribute.

    Returns:
        A list holding the same record objects, some with modified ids.
    """
    # NOTE(review): Biopython's seguid presumably upper-cases its input, so
    # ids differing only by case may be treated as repeats - confirm.
    unique_records = []
    seen_checksums = set()  # set: constant-time membership tests
    redundant_id_count = 0
    for seq in sequences:
        checksum = seguid(seq.id)
        # Loop rather than rename once: the suffixed id may itself already
        # be in use, and it must be registered so later records cannot
        # collide with it.
        while checksum in seen_checksums:
            print("repeated id detected, adding '.{}' suffix".format(redundant_id_count))
            seq.id = "{}.{}".format(seq.id, redundant_id_count)
            redundant_id_count += 1
            checksum = seguid(seq.id)
        seen_checksums.add(checksum)
        unique_records.append(seq)
    return unique_records
 def simple(self, database, formats, entry, length, checksum):
     """Fetch *entry* from *database* in every format and verify it.

     For each format in *formats* the entry is fetched with
     ``Entrez.efetch`` and parsed as a single record; an IOError while
     doing so is turned into MissingExternalDependencyError. *entry*
     must appear in the record name or id (or equal the "gi"
     annotation), and the length and seguid must match *length* and
     *checksum*.
     """
     for f in formats:
         try:
             handle = Entrez.efetch(db=database, id=entry, rettype=f)
             try:
                 record = SeqIO.read(handle, f)
             finally:
                 # Release the handle even when parsing raises.
                 handle.close()
         except IOError:
             raise MissingExternalDependencyError(
                   "internet (or maybe just NCBI) not available")
         self.assertTrue((entry in record.name)
                         or (entry in record.id)
                         or ("gi" in record.annotations
                             and record.annotations["gi"] == entry),
                         "%s got %s, %s" % (entry, record.name, record.id))
         self.assertEqual(len(record), length)
         self.assertEqual(seguid(record.seq), checksum)