Esempio n. 1
0
    def test_NameToInfo_invalid_label(self):
        """NameToInfo: strict mode rejects bad labels, lenient mode salvages"""
        seq_str = 'AA'

        # strict (the default): a label lacking '/' or '-' must raise
        for bad_label in ('U17136.1898-984', 'U17136.1/898984'):
            self.assertRaises(RecordError, NameToInfo,
                              Sequence(seq_str, Name=bad_label))

        # a separator with nothing on one side still parses; the missing
        # part is reported as None
        no_positions = 'U17136.1/'  # missing start/end positions
        no_genbank = '/898-984'  # missing genbank id
        observed = NameToInfo(Sequence(seq_str, Name=no_positions))
        expected = Info({'GenBank': 'U17136.1', 'Start': None, 'End': None})
        self.assertEqual(observed, expected)
        observed = NameToInfo(Sequence(seq_str, Name=no_genbank))
        expected = Info({'GenBank': None, 'Start': 897, 'End': 984})
        self.assertEqual(observed, expected)

        # strict=False: nothing is raised and whatever could be extracted
        # from the label is returned
        no_slash = 'U17136.1898-984'
        no_dash = 'U17136.1/898984'
        observed = NameToInfo(Sequence(seq_str, Name=no_slash), strict=False)
        expected = Info({'GenBank': None, 'Start': None, 'End': None})
        self.assertEqual(observed, expected)
        observed = NameToInfo(Sequence(seq_str, Name=no_dash), strict=False)
        expected = Info({'GenBank': 'U17136.1', 'Start': None, 'End': None})
        self.assertEqual(observed, expected)
Esempio n. 2
0
    def test_single_constructor(self):
        """RdbParser should use constructors if supplied"""

        def to_dna(x, Info):
            # Sequence constructor: rewrites U as T and builds a DnaSequence.
            # The parameter keeps the name Info in case the parser passes it
            # by keyword (not visible from this test).
            return DnaSequence(str(x).replace('U', 'T'), Info=Info)

        f = list(RdbParser(self.oneseq, to_dna))
        self.assertEqual(len(f), 1)
        a = f[0]
        self.assertEqual(a, 'AGTCATCTAGATHCATHC')
        self.assertEqual(a.Info, Info({'Species': 'H.Sapiens',
                                       'OriginalSeq': 'AGUCAUCUAGAUHCAUHC'}))

        def alternativeConstr(header_lines):
            # Info constructor: upper-cases both name and value of every
            # "name:value" header line.
            info = Info()
            for line in header_lines:
                fields = line.strip().split(':', 1)
                #strip out empty lines, lines without name, lines without colon
                if not fields[0] or len(fields) != 2:
                    continue
                name = fields[0].upper()
                value = fields[1].strip().upper()
                info[name] = value
            return info

        f = list(RdbParser(self.oneseq, to_dna, alternativeConstr))
        self.assertEqual(len(f), 1)
        a = f[0]
        self.assertEqual(a, 'AGTCATCTAGATHCATHC')
        exp_info = Info({'OriginalSeq': 'AGUCAUCUAGAUHCAUHC',
                         'Refs': {}, 'SEQ': 'H.SAPIENS'})
        self.assertEqual(a.Info, exp_info)
Esempio n. 3
0
    def test_rich_label(self):
        """RichLabel should render its template from the attached Info"""
        # equality is decided by the rendered string, so an Info attribute
        # the template never mentions does not matter
        template = "%(species)s"
        species_only = RichLabel(Info(species="rat"), template)
        with_seq_id = RichLabel(Info(species="rat", seq_id="xy5"), template)
        self.assertEqual(species_only, with_seq_id)

        # the template decides which Info components appear, and in which order
        label = RichLabel(Info(species="rat", seq_id="xy5"),
                          "%(seq_id)s:%(species)s")
        self.assertEqual(label, "xy5:rat")
        label = RichLabel(Info(species="rat", seq_id="xy5"),
                          "%(species)s:%(seq_id)s")
        self.assertEqual(label, "rat:xy5")

        # components not named by the template are left out of the string...
        label = RichLabel(Info(species="rat", seq_id="xy5"), "%(species)s")
        self.assertEqual(label, "rat")

        # ...but the label still carries the complete Info object
        self.assertEqual(label.Info.species, "rat")
        self.assertEqual(label.Info.seq_id, "xy5")

        # a bare string is accepted, as for a normal str
        self.assertEqual(RichLabel('a'), 'a')
Esempio n. 4
0
 def test_init_data(self):
     """Info init with data should put items in correct places"""
     #need to check init, setting, and resetting of attributes that belong
     #in the Info object and attributes that belong in Info.Refs. Also need
     #to check __getitem__, __setitem__, and __contains__.
     d = Info({'x':3, 'GO':12345})
     self.assertEqual(d.x, 3)
     # GO is reachable both directly and through Refs, wrapped in a list
     self.assertEqual(d.GO, [12345])
     self.assertEqual(d.Refs.GO, [12345])
     # Refs is a required key: deleting it must raise AttributeError
     try:
         del d.Refs
     except AttributeError:
         pass
     else:
         self.fail("Failed to prevent deletion of required key Refs")
     # assigning a tuple or a bare string to GenBank yields a list
     d.GenBank = ('qaz', 'wsx')
     self.assertEqual(d.GenBank, ['qaz', 'wsx'])
     self.assertContains(d.Refs, 'GenBank')
     self.assertContains(d, 'GenBank')
     d.GenBank = 'xyz'
     self.assertEqual(d.GenBank, ['xyz'])
     self.assertSameObj(d.GenBank, d.Refs.GenBank)
     d.GO = 'x'
     self.assertEqual(d.GO, ['x'])
     d.GO.append('y')
     self.assertEqual(d.GO, ['x', 'y'])
     # ZZZ stays on the Info itself and does not appear in Refs
     d.ZZZ = 'zzz'
     self.assertEqual(d.ZZZ, 'zzz')
     self.assertNotContains(d.Refs, 'ZZZ')
     # a missing attribute reads as None instead of raising
     self.assertNotContains(d, 'XXX')
     self.assertEqual(d.XXX, None)
Esempio n. 5
0
def CutgSpeciesParser(infile, strict=True, constructor=CodonUsage):
    """Yields successive records from infile as CodonUsage objects.

    infile: passed straight to CutgSpeciesFinder, which splits it into
        records.
    strict: if True (default), raises RecordError when a record lacks a
        label or counts, or when the counts cannot be converted. If False,
        any record that fails to parse is silently skipped.
    constructor: called as constructor(pairs, Info=info) where pairs is
        zip(codon_order, counts as ints) and info holds 'Species' and
        'NumGenes'.
    """
    if not strict:  #easier to see logic without detailed error handling
        for rec in CutgSpeciesFinder(infile):
            try:
                label, counts = rec
                if not is_cutg_species_label(label):
                    continue
                species, genes = species_label_splitter(label)
                info = Info({'Species': species, 'NumGenes': int(genes)})
                freqs = constructor(zip(codon_order, map(int, counts.split())),
                                    Info=info)
            except Exception:
                # best effort: drop records that cannot be parsed
                continue
            # The yield sits outside the try block so that GeneratorExit
            # (raised here when the consumer closes the generator) and
            # exceptions thrown in by the consumer are never swallowed.
            yield freqs
    else:
        for rec in CutgSpeciesFinder(infile):
            try:
                label, counts = rec
            except ValueError:  #can't have got any counts
                # (rec,) keeps the formatting safe if rec is a tuple
                raise RecordError(
                    "Found label without sequences: %s" % (rec,))

            if not is_cutg_species_label(label):
                raise RecordError(
                    "Found CUTG record without label: %s" % (rec,))
            species, genes = species_label_splitter(label)
            info = Info({'Species': species, 'NumGenes': int(genes)})
            try:
                d = zip(codon_order, map(int, counts.split()))
                freqs = constructor(d, Info=info)
            except Exception:
                raise RecordError("Unable to convert counts: %s" % counts)
            yield freqs
Esempio n. 6
0
 def test_full(self):
     """InfoMaker should build an Info from the usable header lines"""
     # the last three entries (empty, no colon, no name) add nothing to
     # the expected Info below
     header = [
         'acc: X3402', 'abc:1', 'mty: ssu', 'seq: Mit. X3402',
         '', 'nonsense', ':no_name',
     ]
     observed = InfoMaker(header)
     expected = Info()
     for attr, value in (('rRNA', 'X3402'), ('abc', '1'),
                         ('Species', 'Mit. X3402'), ('Gene', 'ssu')):
         setattr(expected, attr, value)
     self.assertEqual(observed, expected)
Esempio n. 7
0
def GroupFastaParser(data,
                     label_to_name,
                     group_key="Group",
                     aligned=False,
                     moltype=ASCII,
                     done_groups=None,
                     DEBUG=False):
    """yields related sequences as a separate seq collection

    Consecutive records whose label.Info[group_key] has already been seen
    are gathered into one collection; a record with an unseen group id
    closes the current collection and starts the next one. Nothing is
    yielded when data contains no records.

    Arguments:
        - data: line iterable data source
        - label_to_name: LabelParser callback
        - group_key: name of group key in RichLabel.Info object
        - aligned: whether sequences are to be considered aligned
        - moltype: default is ASCII
        - done_groups: series of group keys to be excluded
        - DEBUG: if True, prints each label and the keys of every
          collection closed inside the loop
        """
    done_groups = [] if done_groups is None else done_groups

    def make_collection(members, group_id):
        # wrap the gathered sequences and tag them with their group id
        seqs = cogent.LoadSeqs(data=members,
                               moltype=moltype,
                               aligned=aligned)
        seqs.Info = Info(Group=group_id)
        return seqs

    parser = MinimalFastaParser(data,
                                label_to_name=label_to_name,
                                finder=XmfaFinder)
    group_ids = []
    current_collection = {}
    for label, seq in parser:
        seq = moltype.makeSequence(seq, Name=label, Info=label.Info)
        if DEBUG:
            # single-argument print works as statement and as function
            print("str(label)  %s repr(label) %s" % (str(label), repr(label)))
        group_id = label.Info[group_key]
        if not group_ids or group_id in group_ids:
            current_collection[label] = seq
            if not group_ids:
                group_ids.append(group_id)
            continue

        # a new group starts: finish off the current one first
        if group_ids[-1] not in done_groups:
            if DEBUG:
                print("GroupParser collection keys %s" %
                      (list(current_collection),))
            yield make_collection(current_collection, group_ids[-1])
        current_collection = {label: seq}
        group_ids.append(group_id)

    if not group_ids:
        # no records at all: previously this crashed on group_ids[-1]
        return
    # the last group obeys done_groups just like every earlier group
    if group_ids[-1] not in done_groups:
        yield make_collection(current_collection, group_ids[-1])
Esempio n. 8
0
def NcbiFastaLabelParser(line):
    """Creates an Info object and populates it with the line contents.

    The line is split on '|' into five fields: an ignored prefix, the GI,
    the database name, the database reference and the description.
    Returns the tuple (gi, info).

    Raises RecordError if the line does not contain five fields. The
    database name is looked up in NcbiLabels; a name missing from that
    table presumably surfaces as a KeyError (depends on NcbiLabels, which
    is defined elsewhere).

    As of 11/12/03, all records in genpept.fsa and the human RefSeq fasta
    files were consistent with this format.
    """
    info = Info()
    try:
        # maxsplit=4 keeps any further '|' inside the description
        ignore, gi, db, db_ref, description = [
            field.strip() for field in line.split('|', 4)]
    except ValueError:  #probably got wrong value
        raise RecordError("Unable to parse label line %s" % line)
    info.GI = gi
    info[NcbiLabels[db]] = db_ref
    info.Description = description
    return gi, info
Esempio n. 9
0
    def test_single(self):
        """RdbParser should yield one Sequence with Info per single record"""
        # same checks for the one-line and the multi-line fixture
        cases = (
            (self.oneseq, 'AGUCAUCUAGAUHCAUHC'),
            (self.multiline, 'AGUCAUUAGAUHCAUHC'),
        )
        for lines, raw_seq in cases:
            records = list(RdbParser(lines))
            self.assertEqual(len(records), 1)
            record = records[0]
            self.assertEqual(record, Sequence(raw_seq))
            self.assertEqual(record.Info, Info({'Species': 'H.Sapiens',
                                                'OriginalSeq': raw_seq}))
Esempio n. 10
0
def NcbiFastaLabelParser(line):
    """Creates an Info object and populates it with the line contents.

    Expects five '|'-separated fields: an ignored prefix, the GI, the
    database name, the database reference and the description. Returns
    the tuple (gi, info).

    Raises RecordError when the line has fewer than five fields. The
    database name is used as a key into NcbiLabels (defined elsewhere);
    an unknown name presumably raises KeyError.

    As of 11/12/03, all records in genpept.fsa and the human RefSeq fasta
    files were consistent with this format.
    """
    info = Info()
    try:
        # at most 4 splits, so the description may itself contain '|'
        ignore, gi, db, db_ref, description = [
            part.strip() for part in line.split('|', 4)]
    except ValueError:  #probably got wrong value
        raise RecordError("Unable to parse label line %s" % line)
    info.GI = gi
    info[NcbiLabels[db]] = db_ref
    info.Description = description
    return gi, info
Esempio n. 11
0
def parse_header(header_lines):
    """Return Info object from header information.

    header_lines -- list of lines or anything that behaves like it.

    Every line containing a colon is parsed as "key: value": the text
    before the first colon becomes the key, the rest the value, both
    stripped of surrounding whitespace. Lines without a colon are ignored.
    There's no error checking in here.

    The citation line is parsed differently: for a line starting with
    'Citation', only its last whitespace-separated token is stored, under
    the key 'Citation'.
    """
    info = {}
    for line in header_lines:
        if line.startswith('Citation'):
            # split() already discards surrounding whitespace
            info['Citation'] = line.split()[-1]
        elif ':' in line:
            # the colon is known to be present, so this cannot fail
            field, _, value = line.partition(':')
            info[field.strip()] = value.strip()
    return Info(info)
Esempio n. 12
0
 def test_init_empty(self):
     """Info built without arguments should hold only an empty Refs"""
     blank = Info()
     # exactly one key, and that key is Refs
     self.assertEqual(len(blank), 1)
     self.assertContains(blank, 'Refs')
     refs = blank.Refs
     self.assertEqual(refs, DbRefs())
     self.assertTrue(isinstance(refs, DbRefs))
Esempio n. 13
0
 def setUp(self):
     """Setup for Fasta tests."""
     self.strings = ['AAAA', 'CCCC', 'gggg', 'uuuu']
     self.labels = ['1st', '2nd', '3rd', '4th']
     self.infos = ["Dog", "Cat", "Mouse", "Rat"]
     # real lists rather than map(): under Python 3 a map object would be
     # exhausted by the zip loop below, leaving the fixtures empty
     self.sequences_with_labels = [Sequence(s) for s in self.strings]
     self.sequences_with_names = [Sequence(s) for s in self.strings]
     for label, with_label, with_name in zip(self.labels,
                                             self.sequences_with_labels,
                                             self.sequences_with_names):
         with_label.Label = label
         with_name.Name = label
     # expected FASTA renderings
     self.fasta_no_label = '>0\nAAAA\n>1\nCCCC\n>2\ngggg\n>3\nuuuu'
     self.fasta_with_label=\
             '>1st\nAAAA\n>2nd\nCCCC\n>3rd\nGGGG\n>4th\nUUUU'
     # same sequences wrapped at two characters per line
     self.fasta_with_label_lw2=\
         '>1st\nAA\nAA\n>2nd\nCC\nCC\n>3rd\nGG\nGG\n>4th\nUU\nUU'
     self.alignment_dict = {
         '1st': 'AAAA',
         '2nd': 'CCCC',
         '3rd': 'GGGG',
         '4th': 'UUUU'
     }
     self.alignment_object = Alignment(self.alignment_dict)
     # attach a species to every sequence of the alignment
     for label, info in zip(self.labels, self.infos):
         self.alignment_object.NamedSeqs[label].Info = Info(species=info)
     self.fasta_with_label_species=\
           '>1st:Dog\nAAAA\n>2nd:Cat\nCCCC\n>3rd:Mouse\nGGGG\n>4th:Rat\nUUUU'
     self.alignment_object.RowOrder = ['1st', '2nd', '3rd', '4th']
Esempio n. 14
0
 def alternativeConstr(header_lines):
     """Build an Info whose names and values are upper-cased.

     Each header line is split at its first colon. Empty lines, lines
     without a colon and lines with nothing before the colon are skipped.
     """
     result = Info()
     for raw_line in header_lines:
         key, colon, rest = raw_line.strip().partition(':')
         if not (key and colon):
             continue
         result[key.upper()] = rest.strip().upper()
     return result
Esempio n. 15
0
 def test_RfamParser_single_family(self):
     """RfamParser: should work on a single family in stockholm format"""
     expected_aln = {
         'K02120.1/628-682':
             'AUGGGAAAUUCCCCCUCCUAUAACCCCCCCGCUGGUAUCUCCCCCUCAGACUGGC',
         'D00647.1/629-683':
             'AUGGGAAACUCCCCCUCCUAUAACCCCCCCGCUGGCAUCUCCCCCUCAGACUGGC',
     }
     expected_struct = \
         '<<<<<<.........>>>>>>.........<<<<<<.............>>>>>>'
     # the first record is unpacked as (header, alignment, structure)
     records = list(RfamParser(self.single_family))
     header, alignment, structure = records[0]
     self.assertEqual(header, Info())
     self.assertEqual(alignment, expected_aln)
     self.assertEqual(structure, expected_struct)
Esempio n. 16
0
 def test_init_data(self):
     """Info init with data should put items in correct places"""
     #need to check init, setting, and resetting of attributes that belong
     #in the Info object and attributes that belong in Info.Refs. Also need
     #to check __getitem__, __setitem__, and __contains__.
     d = Info({'x': 3, 'GO': 12345})
     self.assertEqual(d.x, 3)
     # GO can be read from the Info and from its Refs, as a list
     self.assertEqual(d.GO, [12345])
     self.assertEqual(d.Refs.GO, [12345])
     # deleting the required Refs key has to raise AttributeError
     try:
         del d.Refs
     except AttributeError:
         pass
     else:
         self.fail("Failed to prevent deletion of required key Refs")
     # a tuple or a single string assigned to GenBank comes back as a list
     d.GenBank = ('qaz', 'wsx')
     self.assertEqual(d.GenBank, ['qaz', 'wsx'])
     self.assertContains(d.Refs, 'GenBank')
     self.assertContains(d, 'GenBank')
     d.GenBank = 'xyz'
     self.assertEqual(d.GenBank, ['xyz'])
     self.assertSameObj(d.GenBank, d.Refs.GenBank)
     d.GO = 'x'
     self.assertEqual(d.GO, ['x'])
     d.GO.append('y')
     self.assertEqual(d.GO, ['x', 'y'])
     # ZZZ is kept on the Info itself, not inside Refs
     d.ZZZ = 'zzz'
     self.assertEqual(d.ZZZ, 'zzz')
     self.assertNotContains(d.Refs, 'ZZZ')
     # reading an attribute that was never set gives None
     self.assertNotContains(d, 'XXX')
     self.assertEqual(d.XXX, None)
Esempio n. 17
0
    def test_full(self):
        """RdbParser: full data, valid and invalid"""
        expected_first = RnaSequence(
            "-??GG-UGAA--CGCU---ACGU-N???---",
            Info=Info({
                'Species': "unidentified Thermus OPB AF027020",
                'Refs': {'rRNA': ['AF027020']},
                'OriginalSeq': '-o[oGG-U{G}AA--C^GC]U---ACGU-Nooo---',
            }))
        expected_second = RnaSequence(
            "---CGAUCG--UAUACG-N???-",
            Info=Info({
                'Species': 'Thermus silvanus X84211',
                'Refs': {'rRNA': ['X84211']},
                'OriginalSeq': '---CGAU[C(G){--UA}U]ACG-Nooo-',
            }))

        def check(observed, expected):
            # compare the sequence, its string form and its Info in turn
            self.assertEqual(observed, expected)
            self.assertEqual(str(observed), str(expected))
            self.assertEqual(observed.Info, expected.Info)

        # only good records: strict parsing returns both of them
        parsed = list(RdbParser(RDB_LINES_ONLY_GOOD.split('\n'), strict=True))
        self.assertEqual(len(parsed), 2)
        check(parsed[0], expected_first)
        check(parsed[1], expected_second)

        # ... and so does lenient parsing
        parsed = list(RdbParser(RDB_LINES_ONLY_GOOD.split('\n'), strict=False))
        self.assertEqual(len(parsed), 2)
        check(parsed[0], expected_first)

        # good and bad records mixed: strict parsing raises
        strict_parser = RdbParser(RDB_LINES_GOOD_BAD.split('\n'), strict=True)
        self.assertRaises(RecordError, list, strict_parser)

        # lenient parsing skips the invalid record and keeps the good ones
        parsed = list(RdbParser(RDB_LINES_GOOD_BAD.split('\n'), strict=False))
        self.assertEqual(len(parsed), 2)
        check(parsed[0], expected_first)
        check(parsed[1], expected_second)
Esempio n. 18
0
    def test_multiple_constructor_bad(self):
        """RdbParser should complain or skip bad records w/ constructor"""
        def dnastrict(x, **kwargs):
            # constructor that reports any DnaSequence failure as a
            # RecordError
            try:
                return DnaSequence(x, **kwargs)
            except Exception:
                raise RecordError("Could not convert sequence")

        # strict (default): the bad record makes the parser raise
        self.assertRaises(RecordError, list, RdbParser(self.oneX, dnastrict))
        # non-strict: the bad record is dropped, two records remain
        f = list(RdbParser(self.oneX, dnastrict, strict=False))
        self.assertEqual(len(f), 2)
        a, b = f

        self.assertEqual(a, 'ACT')
        self.assertEqual(a.Info, Info({
            'Species': 'mit',
            'OriginalSeq': 'ACT'
        }))
        self.assertEqual(b, 'AAA')
        self.assertEqual(b.Info, Info({
            'Species': 'pla',
            'OriginalSeq': 'AAA'
        }))
Esempio n. 19
0
 def call(label):
     """Convert a raw label string into a RichLabel.

     A leading '>' is dropped, the rest is split with sep.split(), and
     each (index, name, converter) entry of field_formatters stores the
     field at that index under info[name], passing it through converter
     first when converter is callable. sep, DEBUG, field_formatters and
     display_template come from the enclosing scope (not shown here).

     Raises IndexError naming the label, property and index when the
     label has no field at a requested index.
     """
     if label.startswith(">"):
         label = label[1:]
     fields = sep.split(label)
     if DEBUG:
         print(fields)
     info = Info()
     for index, name, converter in field_formatters:
         try:
             raw = fields[index]
         except IndexError:
             raise IndexError('parsing label %s failed for property %s at index %s' % (fields, name, index))
         # builtin callable() replaces collections.Callable, which was
         # removed from the collections namespace in Python 3.10
         info[name] = converter(raw) if callable(converter) else raw
     return RichLabel(info, display_template)
Esempio n. 20
0
def InfoMaker(header_lines):
    """Returns an Info object built from 'name: value' header lines.

    Each line is split at its first colon. Empty lines, lines without a
    colon and lines with nothing before the colon are skipped. A name
    found in _field_names is replaced by its mapped name; any other name
    is used unchanged.
    """
    result = Info()
    for raw_line in header_lines:
        key, colon, rest = raw_line.strip().partition(':')
        if not (key and colon):
            continue
        try:
            key = _field_names[key]
        except KeyError:
            pass  # unknown name: keep it as written
        result[key] = rest.strip()
    return result
Esempio n. 21
0
 def call(label):
     """Convert a raw label string into a RichLabel.

     Drops a leading '>', splits the remainder with sep.split() and, for
     every (index, name, converter) in field_formatters, stores the field
     found at that index under info[name], applying converter to it when
     converter is callable. sep, DEBUG, field_formatters and
     display_template are taken from the enclosing scope (not shown here).

     Raises IndexError naming the label, property and index when the
     label has no field at a requested index.
     """
     if label.startswith(">"):
         label = label[1:]
     fields = sep.split(label)
     if DEBUG:
         # single-argument form works as print statement and as function
         print(fields)
     info = Info()
     for index, name, converter in field_formatters:
         try:
             raw = fields[index]
         except IndexError:
             # the context goes into the exception instead of stdout
             raise IndexError(
                 'parsing label %s failed for property %s at index %s' %
                 (fields, name, index))
         info[name] = converter(raw) if callable(converter) else raw
     return RichLabel(info, display_template)
Esempio n. 22
0
def InfoFromLabel(line):
    """Takes a CUTG codon description line and returns an Info object.

    The line is split on backslashes. The first field holds the locus,
    optionally followed by '#' and a CDS number (default '1'); the
    remaining fields are paired with field_order. The 'Description' field
    is split on '/' and every key=value item in it is added to the
    result, with db_xref items stored under their database name.

    Raises RecordError if wrong number of fields etc.
    """
    try:
        raw_fields = line.split('\\')
        result = Info(
            dict(zip(field_order, [f.strip() for f in raw_fields[1:]])))
        #extra processing for first field
        first = raw_fields[0]
        if '#' in first:
            locus, cds_num = [part.strip() for part in first.split('#')]
        else:
            locus, cds_num = first, '1'
        result['Locus'] = locus[1:]  #remove leading '>'
        result['CdsNumber'] = cds_num
        #additional processing for last field: mostly key="value" pairs
        description = result['Description']
        for d in description.split('/'):
            if '=' not in d:  #only key-value pairs are of interest
                continue
            #split once only: there might be '=' in the value
            key, val = [part.strip() for part in d.split('=', 1)]
            #cut off leading and trailing " if present, but _not_ internal!
            if val.startswith('"'):
                val = val[1:]
            if val.endswith('"'):
                val = val[:-1]
            if key != 'db_xref':
                #remember to convert the key to MixedCase naming convention
                result[cfu(key)] = val
                continue
            #cross-refs look like database:reference
            try:
                key, val = val.split(':')
            except ValueError:  #missing actual reference?
                continue  #just skip the bad db records
            try:
                if result[key]:
                    result[key].append(val)
                else:
                    result[key] = [val]
            except (KeyError, TypeError):  #didn't recognize database
                result[key] = val
        return result
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not converted into RecordError
        raise RecordError("Failed to read label line:\n%s" % line)
Esempio n. 23
0
 def test_full(self):
     """InfoMaker should turn usable header lines into Info attributes"""
     # '', 'nonsense' and ':no_name' contribute nothing to the expected
     # Info built below
     usable = ['acc: X3402', 'abc:1', 'mty: ssu', 'seq: Mit. X3402']
     unusable = ['', 'nonsense', ':no_name']
     observed = InfoMaker(usable + unusable)

     expected = Info()
     expected_attrs = [
         ('rRNA', 'X3402'),
         ('abc', '1'),
         ('Species', 'Mit. X3402'),
         ('Gene', 'ssu'),
     ]
     for attr, value in expected_attrs:
         setattr(expected, attr, value)
     self.assertEqual(observed, expected)
Esempio n. 24
0
def NameToInfo(sequence, strict=True):
    """Returns an Info object constructed from the sequence Name

    sequence: Sequence object with a Name attribute
    strict: if True (default), raises RecordError when the label has no
        '/' or its position part has no '-'. If False, whatever could not
        be extracted is returned as None instead.

    The label is expected to look like genbank_id/start-end. It will be
    split on Genbank acc. no. and sequence coordinates.
    The start coordinate will be shifted one position, since in Python the
    first position is 0. A part that is empty (e.g. 'U17136.1/' or
    '/898-984') is returned as None.

    Returns Info with the keys 'GenBank', 'Start' and 'End'.
    """
    label = sequence.Name
    message = "Failed to extract genbank id and positions from label %s"

    gb = pos = None
    try:
        gb, pos = label.split('/', 1)  #split genbank label and pos
    except (AttributeError, ValueError):
        # no '/' in the label, or the Name is not a string at all
        if strict:
            raise RecordError(message % label)
    gb = gb or None
    pos = pos or None

    start = end = None
    if pos:
        try:
            start, end = pos.split('-', 1)  #split start and end pos
        except ValueError:  #no '-' in the position part
            if strict:
                raise RecordError(message % label)

    # Shift start to Python's 0-based numbering (see docstring). An empty
    # start or end becomes None, like any other missing part.
    start = int(start) - 1 if start else None
    end = int(end) if end else None
    return Info({'GenBank': gb, 'Start': start, 'End': end})
Esempio n. 25
0
def HeaderToInfo(header, strict=True):
    """Returns an Info object constructed from the header lines.

    header: list of lines that contain header information. Every
        non-empty line must look like '#=GF XX content', where XX is a
        two-character label.
    strict: if True (default), raises RecordError on a line that does not
        match that format. If False, such lines are skipped.

    Fields that can occur multiple times in a header (BM, DR, RM, CC) are
    stored in a list. Any other field is stored as a single value; if it
    occurs again, the later value replaces the earlier one.
    Labels are renamed through _field_names where a mapping exists.
    The field named 'Comment' after renaming is joined by ' ' to one
    string. Every parsed field is kept.
    """
    multi_value_labels = ['BM', 'DR', 'RM', 'CC']

    # construct temporary dictionary containing all original information
    initial_info = {}
    for line in header:
        line = line.strip()
        if not line:
            continue
        fields = line.split(' ', 2)
        if len(fields) != 3 or fields[0] != '#=GF' or len(fields[1]) != 2:
            if strict:
                raise RecordError("Failed to extract label and content "
                                  "information from line %s" % (line))
            continue
        label = fields[1]
        content = fields[2].strip()
        if label in multi_value_labels:
            initial_info.setdefault(label, []).append(content)
        else:
            initial_info[label] = content

    # transform initial dict into final one: rename labels, join comments
    final_info = {}
    for key, value in initial_info.items():
        name = _field_names.get(key, key)
        if name == 'Comment':
            value = ' '.join(value)
        final_info[name] = value

    return Info(final_info)
Esempio n. 26
0
def TreeAlign(model, seqs, tree=None, indel_rate=0.01, indel_length=0.01,
    ui=None, ests_from_pairwise=True, param_vals=None):
    """Returns a multiple alignment and tree.

    Uses the provided substitution model and a tree for determining the
    progressive order. If a tree is not provided a Neighbour Joining tree is
    constructed from pairwise distances estimated from pairwise aligning the
    sequences. If running in parallel, only the distance estimation is
    parallelised and only the master CPU returns the alignment and tree, other
    CPU's return None, None.

    Arguments:
        - model: a substitution model
        - seqs: a sequence collection, or a dict keyed by sequence name
        - tree: optional tree; its tip names must match the sequence names
        - indel_rate, indel_length: parameters for the progressive pair-HMM
        - ui: optional object whose display() method receives a progress
          message; nothing is reported when it is None
        - ests_from_pairwise: if no tree provided and True, the median value
          of the substitution model parameters are used
        - param_vals: named key, value pairs for model parameters. These
          override ests_from_pairwise.

    The parameter values used are stored on the returned alignment as
    Info['AlignParams'].
    """
    _exclude_params = ['mprobs', 'rate', 'bin_switch']
    # always work on a private copy: the dict is extended further down
    param_vals = dict(param_vals) if param_vals else {}
    if isinstance(seqs, dict):
        seq_names = list(seqs.keys())
    else:
        seq_names = seqs.getSeqNames()

    two_seqs = len(seq_names) == 2

    if tree:
        # sorted() leaves the lists returned by tree and seqs untouched
        tip_names = sorted(tree.getTipNames())
        seq_names = sorted(seq_names)
        assert tip_names == seq_names, \
            "names don't match between seqs and tree: tree=%s; seqs=%s" % \
            (tip_names, seq_names)
        ests_from_pairwise = False
    elif two_seqs:
        # seq_names instead of seqs.getSeqNames(), so a dict works too
        tree = LoadTree(tip_names=seq_names)
        ests_from_pairwise = False
    else:
        if ests_from_pairwise:
            est_params = [param for param in model.getParamList()
                          if param not in _exclude_params]
        else:
            est_params = None

        dcalc = EstimateDistances(seqs, model, do_pair_align=True,
                                    est_params=est_params)
        dcalc.run()
        dists = dcalc.getPairwiseDistances()
        tree = NJ.nj(dists)

    LF = model.makeLikelihoodFunction(tree.bifurcating(name_unnamed=True), aligned=False)
    # est_params and dcalc exist here: ests_from_pairwise is only still
    # True when the distance-estimation branch above was taken
    if ests_from_pairwise and not param_vals:
        # we use the Median to avoid the influence of outlier pairs
        param_vals = {}
        for param in est_params:
            numbers = dcalc.getParamValues(param)
            print("Param Estimate Summary Stats: %s" % param)
            print(numbers.summarize())
            param_vals[param] = numbers.Median

    if ui is not None:
        ui.display("Doing %s alignment" % ["progressive", "pairwise"][two_seqs])
    with LF.updatesPostponed():
        for param, val in param_vals.items():
            LF.setParamRule(param, value=val, is_constant=True)
        LF.setParamRule('indel_rate', value=indel_rate, is_constant=True)
        LF.setParamRule('indel_length', value=indel_length, is_constant=True)
        LF.setSequences(seqs)
    edge = LF.getLogLikelihood().edge
    align = edge.getViterbiPath().getAlignment()
    info = Info()
    info["AlignParams"] = param_vals
    info["AlignParams"].update(dict(indel_length=indel_length, indel_rate=indel_rate))
    align.Info = info
    return align, tree
Esempio n. 27
0
 def test_empty(self):
     """InfoMaker: an empty header should give an Info equal to Info()"""
     self.assertEqual(InfoMaker([]), Info())
Esempio n. 28
0
 def test_identity(self):
     """Every Info instance should own a distinct Refs object"""
     first, second = Info(), Info()
     self.assertNotSameObj(first, second)
     self.assertNotSameObj(first.Refs, second.Refs)
Esempio n. 29
0
def RichGenbankParser(handle,
                      info_excludes=None,
                      moltype=None,
                      skip_contigs=False):
    """Returns annotated sequences from GenBank formatted file.

    Yields (locus, sequence) tuples. A record without sequence data yields
    (locus, rec['contig']), (locus, rec['WGS']) or (locus, None) instead,
    unless skip_contigs is set, in which case it is dropped.

    Arguments:
        - handle: passed to MinimalGenbankParser, which supplies the records
        - info_excludes: a series of fields to be excluded from the Info object
        - moltype: a MolType instance, such as PROTEIN, DNA. Default is ASCII.
          A record whose mol_type is 'protein' or 'DNA' uses PROTEIN or DNA
          for that record only.
        - skip_contigs: ignores records with no actual sequence data, typically
          a genomic contig."""
    info_excludes = info_excludes or []
    default_moltype = moltype or ASCII
    for rec in MinimalGenbankParser(handle):
        info = Info()
        # populate the Info object, excluding the sequence
        for label, value in rec.items():
            if label in info_excludes:
                continue
            info[label] = value

        # Choose the moltype per record. The default is never overwritten,
        # so one record's type cannot leak into the records after it.
        mol_type = rec.get('mol_type')
        if mol_type == 'protein':  # which it doesn't for genbank
            rec_moltype = PROTEIN
        elif mol_type == 'DNA':
            rec_moltype = DNA
        else:
            rec_moltype = default_moltype

        try:
            seq = rec_moltype.makeSequence(rec['sequence'].upper(),
                                           Info=info,
                                           Name=rec['locus'])
        except KeyError:
            # no sequence data in this record
            if not skip_contigs:
                if 'contig' in rec:
                    yield rec['locus'], rec['contig']
                elif 'WGS' in rec:
                    yield rec['locus'], rec['WGS']
                else:
                    yield rec['locus'], None
            continue

        for feature in rec['features']:
            if feature['location'] is None or \
                    feature['type'] in ['source', 'organism']:
                continue
            spans = []
            is_reversed = None
            for location in feature['location']:
                (lo, hi) = (location.first() - 1, location.last())
                # the asserts reject a feature that mixes both strands
                if location.Strand == -1:
                    (lo, hi) = (hi, lo)
                    assert is_reversed is not False
                    is_reversed = True
                else:
                    assert is_reversed is not True
                    is_reversed = False
                # ensure we don't put in a span that starts beyond the sequence
                if lo > len(seq):
                    continue
                # or that's longer than the sequence
                hi = min(hi, len(seq))
                spans.append((lo, hi))
            if is_reversed:
                spans.reverse()

            # name the feature after the first of these fields it has
            for id_field in ['gene', 'note', 'product', 'clone']:
                if id_field in feature:
                    name = feature[id_field]
                    # basestring is the Python 2 base class of str/unicode
                    if not isinstance(name, basestring):
                        name = ' '.join(name)
                    break
            else:
                name = None
            seq.addAnnotation(Feature, feature['type'], name, spans)
        yield (rec['locus'], seq)