コード例 #1
0
    def test_rich_label(self):
        """RichLabel renders its Info through the template and keeps the Info"""

        def rat_with_id():
            # a fresh Info per label, so no two labels share one Info object
            return Info(species="rat", seq_id="xy5")

        # equality is decided by the rendered string, so an extra,
        # unreferenced attribute does not make two labels differ
        species_only = RichLabel(Info(species="rat"), "%(species)s")
        species_and_id = RichLabel(rat_with_id(), "%(species)s")
        self.assertEqual(species_only, species_and_id)

        # the template controls which Info components appear, and in what order
        for template, rendered in (
            ("%(seq_id)s:%(species)s", "xy5:rat"),
            ("%(species)s:%(seq_id)s", "rat:xy5"),
        ):
            self.assertEqual(RichLabel(rat_with_id(), template), rendered)

        # components missing from the template are left out of the string ...
        label = RichLabel(rat_with_id(), "%(species)s")
        self.assertEqual(label, "rat")

        # ... but remain reachable through the attached Info object
        self.assertEqual(label.info.species, "rat")
        self.assertEqual(label.info.seq_id, "xy5")

        # a plain string is accepted in place of an Info object
        self.assertEqual(RichLabel("a"), "a")
コード例 #2
0
ファイル: test_rdb.py プロジェクト: rahulghangas/cogent3
    def test_single_constructor(self):
        """RdbParser should use constructors if supplied"""

        def to_dna(x, info):
            # sequence constructor: replace U with T before building the
            # DnaSequence, passing the parsed info straight through
            return DnaSequence(str(x).replace("U", "T"), info=info)

        f = list(RdbParser(self.oneseq, to_dna))
        self.assertEqual(len(f), 1)
        a = f[0]
        self.assertEqual(a, "AGTCATCTAGATHCATHC")
        self.assertEqual(
            a.info, Info({"Species": "H.Sapiens", "OriginalSeq": "AGUCAUCUAGAUHCAUHC"})
        )

        def alternativeConstr(header_lines):
            # info constructor: upper-cases both the field name and its value
            info = Info()
            for line in header_lines:
                fields = line.strip().split(":", 1)
                # strip out empty lines, lines without name, lines without
                # colon
                if not fields[0] or len(fields) != 2:
                    continue
                name = fields[0].upper()
                value = fields[1].strip().upper()
                info[name] = value
            return info

        f = list(RdbParser(self.oneseq, to_dna, alternativeConstr))
        self.assertEqual(len(f), 1)
        a = f[0]
        self.assertEqual(a, "AGTCATCTAGATHCATHC")
        # with the alternative constructor the header field is stored under
        # its upper-cased raw name ("SEQ") with an upper-cased value
        exp_info = Info(
            {"OriginalSeq": "AGUCAUCUAGAUHCAUHC", "Refs": {}, "SEQ": "H.SAPIENS"}
        )
        self.assertEqual(a.info, exp_info)
コード例 #3
0
 def test_update(self):
     """update should warn the user of overlapping keys"""
     # NOTE(review): no warnings.simplefilter("always") is installed inside
     # the context, so this relies on the active filters letting the warning
     # through -- confirm this is intended
     with warnings.catch_warnings(record=True) as w:
         d1 = Info({"key1": "value1", "key2": "value2", "key3": "value3"})
         d2 = Info({"key2": "value2", "key3": "value3", "key4": "value4"})
         # only the recorded warnings are inspected, so the return value of
         # update() is not kept
         d1.update(d2)
         self.assertEqual(len(w), 1)
コード例 #4
0
    def test_single(self):
        """RdbParser should yield one sequence, with its Info, per single record"""
        # (input lines, expected raw sequence) for the one-line and the
        # multi-line fixture
        cases = (
            (self.oneseq, "AGUCAUCUAGAUHCAUHC"),
            (self.multiline, "AGUCAUUAGAUHCAUHC"),
        )
        for lines, raw_seq in cases:
            records = list(RdbParser(lines))
            self.assertEqual(len(records), 1)
            record = records[0]
            self.assertEqual(record, Sequence(raw_seq))
            expected_info = Info({"Species": "H.Sapiens", "OriginalSeq": raw_seq})
            self.assertEqual(record.info, expected_info)
コード例 #5
0
def GroupFastaParser(
    data,
    label_to_name,
    group_key="Group",
    aligned=False,
    moltype=ASCII,
    done_groups=None,
    DEBUG=False,
):
    """yields related sequences as a separate seq collection

    Parameters
    ----------
    data
        line iterable data source
    label_to_name
        LabelParser callback
    group_key
        name of group key in RichLabel.info object
    aligned
        whether sequences are to be considered aligned
    moltype
        default is ASCII
    done_groups
        series of group keys to be excluded
    DEBUG
        when True, prints each label and the keys of each completed group

    Yields
    ------
    One collection per group, built with cogent3.make_aligned_seqs when
    ``aligned`` is true and cogent3.make_unaligned_seqs otherwise. Each
    collection carries an Info whose ``Group`` entry is the group id.
    Nothing is yielded when the parser produces no records.

    """
    done_groups = [] if done_groups is None else done_groups
    # one constructor for every group, so `aligned` applies to all of them
    # and not just to the final group
    make_collection = (
        cogent3.make_aligned_seqs if aligned else cogent3.make_unaligned_seqs
    )
    parser = MinimalFastaParser(data,
                                label_to_name=label_to_name,
                                finder=XmfaFinder)
    group_ids = []
    current_collection = {}
    for label, seq in parser:
        seq = moltype.make_seq(seq, name=label, info=label.info)
        if DEBUG:
            print("str(label) ", str(label), "repr(label)", repr(label))
        if not group_ids or label.info[group_key] in group_ids:
            current_collection[label] = seq
            if not group_ids:
                group_ids.append(label.info[group_key])
        else:
            # a new group id was seen: emit the finished group (unless it is
            # excluded) before starting to collect the next one
            if group_ids[-1] not in done_groups:
                info = Info(Group=group_ids[-1])
                if DEBUG:
                    print("GroupParser collection keys",
                          list(current_collection.keys()))
                seqs = make_collection(current_collection, moltype=moltype)
                seqs.info = info
                yield seqs
            current_collection = {label: seq}
            group_ids.append(label.info[group_key])

    if not group_ids:
        # no records at all: there is no final group to emit
        return

    # NOTE(review): the final group is yielded without consulting
    # done_groups, unlike the groups emitted inside the loop -- confirm
    # whether that is intended
    info = Info(Group=group_ids[-1])
    seqs = make_collection(current_collection, moltype=moltype, info=info)
    yield seqs
コード例 #6
0
    def test_full(self):
        """RdbParser: full data, valid and invalid"""
        first_info = Info({
            "Species": "unidentified Thermus OPB AF027020",
            "Refs": {
                "rRNA": ["AF027020"]
            },
            "OriginalSeq": "-o[oGG-U{G}AA--C^GC]U---ACGU-Nooo---",
        })
        first = RnaSequence("-??GG-UGAA--CGCU---ACGU-N???---", info=first_info)
        second_info = Info({
            "Species": "Thermus silvanus X84211",
            "Refs": {
                "rRNA": ["X84211"]
            },
            "OriginalSeq": "---CGAU[C(G){--UA}U]ACG-Nooo-",
        })
        second = RnaSequence("---CGAUCG--UAUACG-N???-", info=second_info)

        def check_record(observed, expected):
            # compare the sequence object, its string form and its info
            self.assertEqual(observed, expected)
            self.assertEqual(str(observed), str(expected))
            self.assertEqual(observed.info, expected.info)

        # with only good records the result does not depend on strict
        parsed = list(RdbParser(RDB_LINES_ONLY_GOOD.split("\n"), strict=True))
        self.assertEqual(len(parsed), 2)
        check_record(parsed[0], first)
        check_record(parsed[1], second)

        parsed = list(RdbParser(RDB_LINES_ONLY_GOOD.split("\n"), strict=False))
        self.assertEqual(len(parsed), 2)
        check_record(parsed[0], first)

        # strict parsing must raise on the invalid record
        strict_parser = RdbParser(RDB_LINES_GOOD_BAD.split("\n"), strict=True)
        self.assertRaises(RecordError, list, strict_parser)

        # non-strict parsing skips the invalid record and keeps the good ones
        parsed = list(RdbParser(RDB_LINES_GOOD_BAD.split("\n"), strict=False))
        self.assertEqual(len(parsed), 2)
        check_record(parsed[0], first)
        check_record(parsed[1], second)
コード例 #7
0
def NcbiFastaLabelParser(line):
    """Build an Info object from a pipe-delimited NCBI FASTA label line.

    The line is split on "|" into five stripped fields: an ignored leading
    field, the GI, a database code, the database reference and the
    description. The reference is stored under ``NcbiLabels[db]``.

    As of 11/12/03, all records in genpept.fsa and the human RefSeq fasta
    files were consistent with this format.

    Returns the tuple ``(gi, info)``. Raises RecordError when the line does
    not split into exactly five fields.
    """
    info = Info()
    try:
        fields = [strip(part) for part in line.split("|", 4)]
        _, gi, db, db_ref, description = fields
    except ValueError:  # wrong number of fields on the line
        raise RecordError("Unable to parse label line %s" % line)
    info.GI = gi
    info[NcbiLabels[db]] = db_ref
    info.Description = description
    return gi, info
コード例 #8
0
 def setUp(self):
     """Build the sequences, FASTA strings and alignment used by the Fasta tests."""
     self.strings = ["AAAA", "CCCC", "gggg", "uuuu"]
     self.labels = ["1st", "2nd", "3rd", "4th"]
     self.infos = ["Dog", "Cat", "Mouse", "Rat"]

     # two independent sets of Sequence objects: one tagged through .label,
     # the other through .name
     self.sequences_with_labels = [Sequence(s) for s in self.strings]
     self.sequences_with_names = [Sequence(s) for s in self.strings]
     tagged = zip(self.labels, self.sequences_with_labels,
                  self.sequences_with_names)
     for tag, labelled, named in tagged:
         labelled.label = tag
         named.name = tag

     # FASTA text fixtures
     self.fasta_no_label = ">0\nAAAA\n>1\nCCCC\n>2\ngggg\n>3\nuuuu\n"
     self.fasta_with_label = ">1st\nAAAA\n>2nd\nCCCC\n>3rd\nGGGG\n>4th\nUUUU\n"
     self.fasta_with_label_lw2 = (
         ">1st\nAA\nAA\n>2nd\nCC\nCC\n>3rd\nGG\nGG\n>4th\nUU\nUU\n")
     self.fasta_with_label_species = (
         ">1st:Dog\nAAAA\n>2nd:Cat\nCCCC\n>3rd:Mouse\nGGGG\n>4th:Rat\nUUUU\n"
     )

     # alignment fixture, with a species Info attached to every named sequence
     self.alignment_dict = dict(
         zip(self.labels, ["AAAA", "CCCC", "GGGG", "UUUU"]))
     self.alignment_object = Alignment(self.alignment_dict)
     for seq_name, species in zip(self.labels, self.infos):
         self.alignment_object.named_seqs[seq_name].info = Info(species=species)
     self.alignment_object.RowOrder = ["1st", "2nd", "3rd", "4th"]
コード例 #9
0
 def test_init_empty(self):
     """An Info built with no arguments holds only a default DbRefs under Refs"""
     empty = Info()
     # "Refs" is the single entry of a fresh Info
     self.assertEqual(len(empty), 1)
     self.assertContains(empty, "Refs")
     # and it is a DbRefs equal to a default-constructed one
     self.assertEqual(empty.Refs, DbRefs())
     self.assertIsInstance(empty.Refs, DbRefs)
コード例 #10
0
ファイル: test_rdb.py プロジェクト: rahulghangas/cogent3
    def test_multiple_constructor_bad(self):
        """RdbParser should raise on, or skip, records the constructor rejects"""

        def dnastrict(x, **kwargs):
            # report any construction failure as a RecordError
            try:
                return DnaSequence(x, **kwargs)
            except Exception:
                raise RecordError("Could not convert sequence")

        # default parsing: consuming the parser raises RecordError
        strict_parser = RdbParser(self.oneX, dnastrict)
        with self.assertRaises(RecordError):
            list(strict_parser)

        # strict=False: two records are returned
        records = list(RdbParser(self.oneX, dnastrict, strict=False))
        self.assertEqual(len(records), 2)
        first, second = records

        for record, seq, species in (
            (first, "ACT", "mit"),
            (second, "AAA", "pla"),
        ):
            self.assertEqual(record, seq)
            self.assertEqual(
                record.info, Info({"Species": species, "OriginalSeq": seq})
            )
コード例 #11
0
 def alternativeConstr(header_lines):
     """Build an Info from "name: value" lines, upper-casing names and values."""
     info = Info()
     for line in header_lines:
         name, colon, value = line.strip().partition(":")
         # skip empty lines, lines without a name and lines without a colon
         if not name or not colon:
             continue
         info[name.upper()] = value.strip().upper()
     return info
コード例 #12
0
 def test_init_data(self):
     """Info init with data should route items to Info itself or to Info.Refs"""
     # Exercises init, setting and resetting of attributes that live on the
     # Info object and of attributes that live in Info.Refs, along with
     # __getitem__, __setitem__ and __contains__.
     info = Info({"x": 3, "GO": 12345})
     self.assertEqual(info.x, 3)
     self.assertEqual(info.GO, [12345])
     self.assertEqual(info.Refs.GO, [12345])

     # Refs is a required key: deleting it must raise AttributeError
     try:
         del info.Refs
     except AttributeError:
         pass
     else:
         raise Exception("Failed to prevent deletion of required key Refs")

     # GenBank is visible from both Info and Info.Refs, and reads back as a list
     info.GenBank = ("qaz", "wsx")
     self.assertEqual(info.GenBank, ["qaz", "wsx"])
     self.assertContains(info.Refs, "GenBank")
     self.assertContains(info, "GenBank")
     info.GenBank = "xyz"
     self.assertEqual(info.GenBank, ["xyz"])
     self.assertSameObj(info.GenBank, info.Refs.GenBank)

     # resetting GO replaces the list, which can then be appended to
     info.GO = "x"
     self.assertEqual(info.GO, ["x"])
     info.GO.append("y")
     self.assertEqual(info.GO, ["x", "y"])

     # ZZZ stays on the Info object itself and is kept as given
     info.ZZZ = "zzz"
     self.assertEqual(info.ZZZ, "zzz")
     self.assertNotContains(info.Refs, "ZZZ")

     # an attribute that was never set is absent and reads as None
     self.assertNotContains(info, "XXX")
     self.assertEqual(info.XXX, None)
コード例 #13
0
def InfoMaker(header_lines):
    """Returns an Info object built from "name: value" header lines.

    Each field name is translated through ``_field_names`` when it has an
    entry there and is used unchanged otherwise; values are stripped of
    surrounding whitespace.
    """
    info = Info()
    for line in header_lines:
        key, colon, raw_value = line.strip().partition(":")
        # skip empty lines, lines without a name and lines without a colon
        if not key or not colon:
            continue
        info[_field_names.get(key, key)] = raw_value.strip()
    return info
コード例 #14
0
 def call(label):
     """Parse one label into a RichLabel.

     A leading ">" is removed, the rest is split with ``sep.split`` and, for
     every ``(index, name, converter)`` in ``field_formatters``, the field at
     ``index`` is stored in an Info under ``name``. The field is passed
     through ``converter`` first when the converter is callable.

     Returns ``RichLabel(info, display_template)``. Raises IndexError, with
     the label, property name and index in the message, when a field cannot
     be looked up.
     """
     # slice rather than index, so an empty label does not raise a bare
     # IndexError before any field has been examined
     if label[:1] == ">":
         label = label[1:]
     label = sep.split(label)
     if DEBUG:
         print(label)
     info = Info()
     for index, name, converter in field_formatters:
         try:
             value = label[index]
             if isinstance(converter, Callable):
                 value = converter(value)
         except IndexError as err:
             # same descriptive error whether or not a converter is used
             raise IndexError(
                 "parsing label %s failed for property %s at index %s" %
                 (label, name, index)) from err
         info[name] = value
     return RichLabel(info, display_template)
コード例 #15
0
 def test_full(self):
     """InfoMaker should translate known field names and skip malformed lines"""
     header = [
         "acc: X3402",
         "abc:1",
         "mty: ssu",
         "seq: Mit. X3402",
         "",
         "nonsense",
         ":no_name",
     ]
     # expected attributes; the last three header lines contribute nothing
     expected = Info()
     for attr, value in (
         ("rRNA", "X3402"),
         ("abc", "1"),
         ("Species", "Mit. X3402"),
         ("Gene", "ssu"),
     ):
         setattr(expected, attr, value)
     self.assertEqual(InfoMaker(header), expected)
コード例 #16
0
ファイル: genbank.py プロジェクト: jbw900/cogent3
def RichGenbankParser(handle,
                      info_excludes=None,
                      moltype=None,
                      skip_contigs=False,
                      add_annotation=None):
    """Returns annotated sequences from GenBank formatted file.

    Parameters
    ----------
    handle
        data source passed to MinimalGenbankParser
    info_excludes
        a series of fields to be excluded from the Info object
    moltype
        a MolType instance, such as PROTEIN, DNA. Default is ASCII.
        A record whose mol_type is "protein" or "DNA" uses PROTEIN or DNA
        for that record only.
    skip_contigs
        ignores records with no actual sequence data, typically
        a genomic contig.
    add_annotation
        a callback function to create an new annotation from a
        GenBank feature. Function is called with the sequence, a feature dict
        and the feature spans.

    Yields
    ------
    ``(locus, seq)`` for each record with sequence data. When building the
    sequence raises KeyError and ``skip_contigs`` is false,
    ``(locus, value)`` is yielded instead, where value is the record's
    "contig" entry, else its "WGS" entry, else None.

    """
    info_excludes = info_excludes or []
    default_moltype = moltype or ASCII
    for rec in MinimalGenbankParser(handle):
        info = Info()
        # populate the info object, excluding the sequence
        for label, value in rec.items():
            if label in info_excludes:
                continue
            info[label] = value

        # resolved per record, so one record's mol_type does not change the
        # moltype used for the records that follow it
        if rec["mol_type"] == "protein":  # which it doesn't for genbank
            rec_moltype = PROTEIN
        elif rec["mol_type"] == "DNA":
            rec_moltype = DNA
        else:
            rec_moltype = default_moltype

        try:
            seq = rec_moltype.make_seq(rec["sequence"].upper(),
                                       info=info,
                                       name=rec["locus"])
        except KeyError:
            # a KeyError here is treated as a record without sequence data
            if not skip_contigs:
                if "contig" in rec:
                    yield rec["locus"], rec["contig"]
                elif "WGS" in rec:
                    yield rec["locus"], rec["WGS"]
                else:
                    yield rec["locus"], None
            continue

        for feature in rec["features"]:
            spans = []
            # strand orientation seen so far within this feature; None until
            # the first location has been examined
            on_reverse = None
            if feature["location"] is None or feature["type"] in [
                    "source", "organism"
            ]:
                continue
            for location in feature["location"]:
                (lo, hi) = (location.first() - 1, location.last())
                if location.Strand == -1:
                    (lo, hi) = (hi, lo)
                    # all locations of one feature must share a strand
                    assert on_reverse is not False
                    on_reverse = True
                else:
                    assert on_reverse is not True
                    on_reverse = False
                # ensure we don't put in a span that starts beyond the sequence
                if lo > len(seq):
                    continue
                # or that's longer than the sequence
                hi = min(hi, len(seq))
                spans.append((lo, hi))

            if add_annotation:
                add_annotation(seq, feature, spans)
            else:
                # name the annotation after the first identifying field present
                for id_field in ["gene", "note", "product", "clone"]:
                    if id_field in feature:
                        name = feature[id_field]
                        if not isinstance(name, str):
                            name = " ".join(name)
                        break
                else:
                    name = None
                seq.add_annotation(Feature, feature["type"], name, spans)

        yield (rec["locus"], seq)
コード例 #17
0
 def test_identity(self):
     """Separately created Info objects must not share themselves or their Refs"""
     first = Info()
     second = Info()
     for left, right in ((first, second), (first.Refs, second.Refs)):
         self.assertNotSameObj(left, right)
コード例 #18
0
 def test_empty(self):
     """InfoMaker: an empty header gives an Info equal to a freshly built one"""
     observed = InfoMaker([])
     self.assertEqual(observed, Info())