def test_rich_label(self):
    """RichLabel should render its template from Info and compare as a string."""

    def rat_info():
        # a fresh Info for every label, so no label shares state with another
        return Info(species="rat", seq_id="xy5")

    # equality is decided by the rendered template, not by the Info contents
    species_only = RichLabel(Info(species="rat"), "%(species)s")
    species_from_full = RichLabel(rat_info(), "%(species)s")
    self.assertEqual(species_only, species_from_full)

    # the template controls which Info fields appear and in what order
    for template, rendered in (
        ("%(seq_id)s:%(species)s", "xy5:rat"),
        ("%(species)s:%(seq_id)s", "rat:xy5"),
    ):
        self.assertEqual(RichLabel(rat_info(), template), rendered)

    # Info fields the template does not mention are left out of the string ...
    label = RichLabel(rat_info(), "%(species)s")
    self.assertEqual(label, "rat")
    # ... but stay reachable through the attached Info object
    self.assertEqual(label.info.species, "rat")
    self.assertEqual(label.info.seq_id, "xy5")

    # a label should be constructable just like a normal string
    self.assertEqual(RichLabel("a"), "a")
def test_single_constructor(self):
    """RdbParser should use constructors if supplied"""

    def to_dna(x, info):
        # sequence constructor: swap U for T before building a DnaSequence
        return DnaSequence(str(x).replace("U", "T"), info=info)

    # custom sequence constructor, default header (info) constructor
    f = list(RdbParser(self.oneseq, to_dna))
    self.assertEqual(len(f), 1)
    a = f[0]
    self.assertEqual(a, "AGTCATCTAGATHCATHC")
    self.assertEqual(
        a.info, Info({"Species": "H.Sapiens", "OriginalSeq": "AGUCAUCUAGAUHCAUHC"})
    )

    def alternativeConstr(header_lines):
        # header constructor that upper-cases both field names and values
        info = Info()
        for line in header_lines:
            fields = line.strip().split(":", 1)
            # strip out empty lines, lines without name, lines without
            # colon
            if not fields[0] or len(fields) != 2:
                continue
            name = fields[0].upper()
            value = fields[1].strip().upper()
            info[name] = value
        return info

    # custom sequence constructor AND custom header constructor
    f = list(RdbParser(self.oneseq, to_dna, alternativeConstr))
    self.assertEqual(len(f), 1)
    a = f[0]
    self.assertEqual(a, "AGTCATCTAGATHCATHC")
    exp_info = Info(
        {"OriginalSeq": "AGUCAUCUAGAUHCAUHC", "Refs": {}, "SEQ": "H.SAPIENS"}
    )
    self.assertEqual(a.info, exp_info)
def test_update(self):
    """Info.update should emit exactly one warning when keys overlap."""
    # key2 and key3 are present in both mappings
    target_data = {"key1": "value1", "key2": "value2", "key3": "value3"}
    incoming_data = {"key2": "value2", "key3": "value3", "key4": "value4"}

    # NOTE(review): the recorded count depends on the warning filters that are
    # active when the test runs; consider warnings.simplefilter("always")
    # here once it is confirmed that Info.update warns only once per call.
    with warnings.catch_warnings(record=True) as caught:
        target = Info(target_data)
        incoming = Info(incoming_data)
        target.update(incoming)

    self.assertEqual(len(caught), 1)
def test_single(self):
    """RdbParser should yield one Sequence, with its Info, for a single record."""
    # (input lines, expected raw sequence) for the one-line and the
    # multi-line versions of the single-record fixture
    cases = (
        (self.oneseq, "AGUCAUCUAGAUHCAUHC"),
        (self.multiline, "AGUCAUUAGAUHCAUHC"),
    )
    for lines, raw_seq in cases:
        records = list(RdbParser(lines))
        self.assertEqual(len(records), 1)
        record = records[0]
        self.assertEqual(record, Sequence(raw_seq))
        self.assertEqual(
            record.info,
            Info({"Species": "H.Sapiens", "OriginalSeq": raw_seq}),
        )
def GroupFastaParser(
    data,
    label_to_name,
    group_key="Group",
    aligned=False,
    moltype=ASCII,
    done_groups=None,
    DEBUG=False,
):
    """yields related sequences as a separate seq collection

    Records are read in order; a new collection is started whenever a record's
    group value has not been seen before.

    Parameters
    ----------
    data
        line iterable data source
    label_to_name
        LabelParser callback
    group_key
        name of group key in RichLabel.info object
    aligned
        whether sequences are to be considered aligned. Applies to every
        yielded collection.
    moltype
        default is ASCII
    done_groups
        series of group keys to be excluded. Applies to every group,
        including the last one in the data.
    DEBUG
        if True, print each label and the keys of each finished collection

    Yields
    ------
    One sequence collection per group that is not in done_groups, with
    ``info`` carrying the group key under "Group". Nothing is yielded for
    empty data.
    """
    done_groups = [] if done_groups is None else done_groups
    # pick the constructor once so every group honours ``aligned``
    make_collection = (
        cogent3.make_aligned_seqs if aligned else cogent3.make_unaligned_seqs
    )
    parser = MinimalFastaParser(data, label_to_name=label_to_name, finder=XmfaFinder)
    group_ids = []
    current_collection = {}
    for label, seq in parser:
        seq = moltype.make_seq(seq, name=label, info=label.info)
        if DEBUG:
            print("str(label) ", str(label), "repr(label)", repr(label))
        group = label.info[group_key]
        if not group_ids or group in group_ids:
            # first record, or a record belonging to an already-seen group
            current_collection[label] = seq
            if not group_ids:
                group_ids.append(group)
            continue

        # a new group starts here: finish off the current one first
        if group_ids[-1] not in done_groups:
            info = Info(Group=group_ids[-1])
            if DEBUG:
                print("GroupParser collection keys", list(current_collection.keys()))
            seqs = make_collection(current_collection, moltype=moltype)
            seqs.info = info
            yield seqs
        current_collection = {label: seq}
        group_ids.append(group)

    # no records at all, or the trailing group is excluded: nothing left to emit
    if not group_ids or group_ids[-1] in done_groups:
        return
    info = Info(Group=group_ids[-1])
    yield make_collection(current_collection, moltype=moltype, info=info)
def test_full(self):
    """RdbParser: full data, valid and invalid"""
    first_expected = RnaSequence(
        "-??GG-UGAA--CGCU---ACGU-N???---",
        info=Info(
            {
                "Species": "unidentified Thermus OPB AF027020",
                "Refs": {"rRNA": ["AF027020"]},
                "OriginalSeq": "-o[oGG-U{G}AA--C^GC]U---ACGU-Nooo---",
            }
        ),
    )
    second_expected = RnaSequence(
        "---CGAUCG--UAUACG-N???-",
        info=Info(
            {
                "Species": "Thermus silvanus X84211",
                "Refs": {"rRNA": ["X84211"]},
                "OriginalSeq": "---CGAU[C(G){--UA}U]ACG-Nooo-",
            }
        ),
    )

    def check_record(observed, expected):
        # compare as sequence objects, as strings, and by attached Info
        self.assertEqual(observed, expected)
        self.assertEqual(str(observed), str(expected))
        self.assertEqual(observed.info, expected.info)

    # with only good records the result should not depend on strict
    parsed = list(RdbParser(RDB_LINES_ONLY_GOOD.split("\n"), strict=True))
    self.assertEqual(len(parsed), 2)
    check_record(parsed[0], first_expected)
    check_record(parsed[1], second_expected)

    parsed = list(RdbParser(RDB_LINES_ONLY_GOOD.split("\n"), strict=False))
    self.assertEqual(len(parsed), 2)
    check_record(parsed[0], first_expected)

    # when strict, an invalid record raises once the parser is consumed
    strict_parser = RdbParser(RDB_LINES_GOOD_BAD.split("\n"), strict=True)
    self.assertRaises(RecordError, list, strict_parser)

    # when not strict, the invalid record is skipped and the good ones kept
    parsed = list(RdbParser(RDB_LINES_GOOD_BAD.split("\n"), strict=False))
    self.assertEqual(len(parsed), 2)
    check_record(parsed[0], first_expected)
    check_record(parsed[1], second_expected)
def NcbiFastaLabelParser(line):
    """Parse a '|'-delimited NCBI FASTA label into ``(gi, info)``.

    The line is split on '|' into five stripped fields: a leading field that
    is discarded, the GI, the database tag, the database reference and the
    description. The returned Info holds the GI, the reference (stored under
    the name NcbiLabels gives for the database tag) and the description.

    As of 11/12/03, all records in genpept.fsa and the human RefSeq fasta
    files were consistent with this format.

    Raises RecordError if the line has fewer than five fields.
    """
    try:
        _, gi, db, db_ref, description = [
            strip(field) for field in line.split("|", 4)
        ]
    except ValueError:  # unpacking failed: fewer than five fields
        raise RecordError("Unable to parse label line %s" % line)

    info = Info()
    info.GI = gi
    info[NcbiLabels[db]] = db_ref
    info.Description = description
    return gi, info
def setUp(self):
    """Build the sequences, FASTA strings and alignment shared by the Fasta tests."""
    self.strings = ["AAAA", "CCCC", "gggg", "uuuu"]
    self.labels = ["1st", "2nd", "3rd", "4th"]
    self.infos = ["Dog", "Cat", "Mouse", "Rat"]

    # two independent sets of Sequence objects: one tagged through .label,
    # the other through .name
    self.sequences_with_labels = [Sequence(raw) for raw in self.strings]
    self.sequences_with_names = [Sequence(raw) for raw in self.strings]
    for position, tag in enumerate(self.labels):
        self.sequences_with_labels[position].label = tag
        self.sequences_with_names[position].name = tag

    # expected FASTA renderings
    self.fasta_no_label = ">0\nAAAA\n>1\nCCCC\n>2\ngggg\n>3\nuuuu\n"
    self.fasta_with_label = ">1st\nAAAA\n>2nd\nCCCC\n>3rd\nGGGG\n>4th\nUUUU\n"
    self.fasta_with_label_lw2 = (
        ">1st\nAA\nAA\n>2nd\nCC\nCC\n>3rd\nGG\nGG\n>4th\nUU\nUU\n"
    )
    self.fasta_with_label_species = (
        ">1st:Dog\nAAAA\n>2nd:Cat\nCCCC\n>3rd:Mouse\nGGGG\n>4th:Rat\nUUUU\n"
    )

    # alignment whose sequences each carry a species in their Info
    self.alignment_dict = {
        "1st": "AAAA",
        "2nd": "CCCC",
        "3rd": "GGGG",
        "4th": "UUUU",
    }
    self.alignment_object = Alignment(self.alignment_dict)
    for tag, species in zip(self.labels, self.infos):
        self.alignment_object.named_seqs[tag].info = Info(species=species)
    self.alignment_object.RowOrder = ["1st", "2nd", "3rd", "4th"]
def test_init_empty(self):
    """An Info built with no arguments should hold only an empty DbRefs."""
    empty = Info()

    # the single entry is the Refs container
    self.assertEqual(len(empty), 1)
    self.assertContains(empty, "Refs")

    # Refs is a DbRefs instance that equals a freshly built one
    refs = empty.Refs
    self.assertEqual(refs, DbRefs())
    self.assertTrue(isinstance(refs, DbRefs))
def test_multiple_constructor_bad(self):
    """RdbParser should complain or skip bad records w/ constructor"""

    def dnastrict(x, **kwargs):
        # report any construction failure as a RecordError
        try:
            converted = DnaSequence(x, **kwargs)
        except Exception:
            raise RecordError("Could not convert sequence")
        return converted

    # by default (strict not overridden) the bad record aborts parsing
    self.assertRaises(RecordError, list, RdbParser(self.oneX, dnastrict))

    # with strict=False the bad record is dropped and the rest are returned
    parsed = list(RdbParser(self.oneX, dnastrict, strict=False))
    self.assertEqual(len(parsed), 2)
    expected = (("ACT", "mit"), ("AAA", "pla"))
    for record, (raw_seq, species) in zip(parsed, expected):
        self.assertEqual(record, raw_seq)
        self.assertEqual(
            record.info, Info({"Species": species, "OriginalSeq": raw_seq})
        )
def alternativeConstr(header_lines):
    """Build an Info from "name: value" lines, upper-casing names and values.

    Lines that are empty, have no name before the colon, or contain no colon
    are skipped.
    """
    info = Info()
    for raw_line in header_lines:
        name, colon, rest = raw_line.strip().partition(":")
        # an empty name covers blank lines and ":value"; an empty colon
        # means the line had no separator at all
        if not name or not colon:
            continue
        info[name.upper()] = rest.strip().upper()
    return info
def test_init_data(self):
    """Info init with data should put items in correct places"""
    # need to check init, setting, and resetting of attributes that belong
    # in the Info object and attributes that belong in Info.Refs. Also need
    # to check __getitem__, __setitem__, and __contains__.
    d = Info({"x": 3, "GO": 12345})
    self.assertEqual(d.x, 3)
    self.assertEqual(d.GO, [12345])
    self.assertEqual(d.Refs.GO, [12345])

    # Refs is a required key: deleting it must raise AttributeError.
    # assertRaises reports a regression as a test failure instead of an
    # error raised from a bare Exception.
    self.assertRaises(AttributeError, delattr, d, "Refs")

    # a tuple assigned to a reference attribute reads back as a list,
    # visible both through Refs and through the Info itself
    d.GenBank = ("qaz", "wsx")
    self.assertEqual(d.GenBank, ["qaz", "wsx"])
    self.assertContains(d.Refs, "GenBank")
    self.assertContains(d, "GenBank")

    # a single string reads back as a one-item list; the attribute and
    # the Refs entry are the same object
    d.GenBank = "xyz"
    self.assertEqual(d.GenBank, ["xyz"])
    self.assertSameObj(d.GenBank, d.Refs.GenBank)

    # resetting a reference given at init, then mutating it in place
    d.GO = "x"
    self.assertEqual(d.GO, ["x"])
    d.GO.append("y")
    self.assertEqual(d.GO, ["x", "y"])

    # ZZZ is stored as given and does not appear in Refs
    d.ZZZ = "zzz"
    self.assertEqual(d.ZZZ, "zzz")
    self.assertNotContains(d.Refs, "ZZZ")

    # unknown attributes read as None without being added
    self.assertNotContains(d, "XXX")
    self.assertEqual(d.XXX, None)
def InfoMaker(header_lines):
    """Returns an Info object constructed from the header lines.

    Each line is expected to look like "name: value". Names found in
    _field_names are replaced by their mapped name; other names are used
    unchanged. Empty lines, lines without a name and lines without a colon
    are skipped.
    """
    info = Info()
    for raw_line in header_lines:
        key, colon, rest = raw_line.strip().partition(":")
        # skip blank lines, ":value" lines and lines with no separator
        if not key or not colon:
            continue
        try:
            key = _field_names[key]
        except KeyError:
            # not a known field name: keep the name as written
            pass
        info[key] = rest.strip()
    return info
def call(label):
    """Parse a raw label string into a RichLabel.

    A leading ">" is dropped, the rest is split with ``sep``, and each
    ``(index, name, converter)`` entry of ``field_formatters`` stores the
    field at ``index`` under ``name`` in a new Info. The field is passed
    through ``converter`` when the converter is callable, otherwise it is
    stored as is.

    Raises IndexError, naming the property and index, when the label has no
    field at a requested index. An empty label also raises IndexError.
    Exceptions raised by a converter propagate unchanged.
    """
    # drop the leading FASTA marker if present
    label = label[1:] if label[0] == ">" else label
    label = sep.split(label)
    if DEBUG:
        print(label)
    info = Info()
    for index, name, converter in field_formatters:
        # only the field lookup is guarded, so a missing field is reported the
        # same way with or without a converter, and converter errors are not
        # mistaken for a label that is too short
        try:
            field = label[index]
        except IndexError:
            raise IndexError(
                "parsing label %s failed for property %s at index %s"
                % (label, name, index)
            )
        info[name] = converter(field) if callable(converter) else field
    return RichLabel(info, display_template)
def test_full(self):
    """InfoMaker should return Info object with name, value pairs"""
    # the last three entries (blank line, no colon, no name) are expected to
    # contribute nothing to the result
    header = [
        "acc: X3402",
        "abc:1",
        "mty: ssu",
        "seq: Mit. X3402",
        "",
        "nonsense",
        ":no_name",
    ]

    # acc, mty and seq are expected back under rRNA, Gene and Species;
    # abc keeps its own name
    expected = Info()
    for attr, value in (
        ("rRNA", "X3402"),
        ("abc", "1"),
        ("Species", "Mit. X3402"),
        ("Gene", "ssu"),
    ):
        setattr(expected, attr, value)

    self.assertEqual(InfoMaker(header), expected)
def RichGenbankParser(
    handle, info_excludes=None, moltype=None, skip_contigs=False, add_annotation=None
):
    """Returns annotated sequences from GenBank formatted file.

    Parameters
    ----------
    handle
        data source passed to MinimalGenbankParser
    info_excludes
        a series of fields to be excluded from the Info object
    moltype
        a MolType instance, such as PROTEIN, DNA. Default is ASCII.
        A record whose mol_type is "protein" or "DNA" is built with PROTEIN
        or DNA respectively; that choice applies to that record only.
    skip_contigs
        ignores records with no actual sequence data, typically
        a genomic contig.
    add_annotation
        a callback function to create an new annotation from a
        GenBank feature. Function is called with the sequence, a feature
        dict and the feature spans.

    Yields
    ------
    ``(locus, seq)`` for each record with sequence data. When building the
    sequence raises KeyError and skip_contigs is False, yields
    ``(locus, value)`` instead, where value is the record's "contig" entry,
    else its "WGS" entry, else None.
    """
    info_excludes = info_excludes or []
    default_moltype = moltype or ASCII
    for rec in MinimalGenbankParser(handle):
        info = Info()
        # copy every record field not listed in info_excludes
        for label, value in rec.items():
            if label in info_excludes:
                continue
            info[label] = value

        # choose the moltype per record, so one DNA/protein record does not
        # change the moltype used for the records that follow it
        if rec["mol_type"] == "protein":  # which it doesn't for genbank
            rec_moltype = PROTEIN
        elif rec["mol_type"] == "DNA":
            rec_moltype = DNA
        else:
            rec_moltype = default_moltype

        try:
            seq = rec_moltype.make_seq(
                rec["sequence"].upper(), info=info, name=rec["locus"]
            )
        except KeyError:
            # no usable sequence: report what the record has instead, unless
            # the caller asked for such records to be skipped
            if not skip_contigs:
                if "contig" in rec:
                    yield rec["locus"], rec["contig"]
                elif "WGS" in rec:
                    yield rec["locus"], rec["WGS"]
                else:
                    yield rec["locus"], None
            continue

        seq_length = len(seq)
        for feature in rec["features"]:
            if feature["location"] is None or feature["type"] in [
                "source",
                "organism",
            ]:
                continue

            spans = []
            # None until the first location; the asserts below require every
            # location of a feature to be on the same strand
            is_reversed = None
            for location in feature["location"]:
                # convert to a 0-based start
                (lo, hi) = (location.first() - 1, location.last())
                if location.Strand == -1:
                    (lo, hi) = (hi, lo)
                    assert is_reversed is not False
                    is_reversed = True
                else:
                    assert is_reversed is not True
                    is_reversed = False
                # ensure we don't put in a span that starts beyond the sequence
                if lo > seq_length:
                    continue
                # or that's longer than the sequence
                hi = min(hi, seq_length)
                spans.append((lo, hi))

            if add_annotation:
                add_annotation(seq, feature, spans)
            else:
                # name the feature after the first identifying field present
                for id_field in ["gene", "note", "product", "clone"]:
                    if id_field in feature:
                        name = feature[id_field]
                        if not isinstance(name, str):
                            name = " ".join(name)
                        break
                else:
                    name = None
                seq.add_annotation(Feature, feature["type"], name, spans)

        yield (rec["locus"], seq)
def test_identity(self):
    """Each Info instance should be created with its own Refs object."""
    first, second = Info(), Info()
    # neither the Info objects nor their Refs may be shared
    self.assertNotSameObj(first, second)
    self.assertNotSameObj(first.Refs, second.Refs)
def test_empty(self):
    """InfoMaker: an empty header should give an Info equal to Info()."""
    self.assertEqual(InfoMaker([]), Info())