Beispiel #1
0
def CutgParser(infile, strict=True, constructor=CodonUsage):
    """Yields successive records from infile as CodonUsage objects.

    infile: source of lines; it is handed to CutgFinder, which groups the
        lines into records.
    strict: if True (default), raises RecordError when a record cannot be
        unpacked into (label, counts), when the label is rejected by
        is_cutg_label, or when the constructor raises NotImplementedError.
        If False, any record that fails to parse is silently skipped.
    constructor: callable invoked as constructor(pairs, Info=info), where
        pairs is a list of (codon, count) tuples built by zipping
        codon_order with the integer counts.
    """
    if not strict:
        # best-effort mode: drop every record that cannot be converted
        for rec in CutgFinder(infile):
            try:
                label, counts = rec
                if not is_cutg_label(label):
                    continue
                info = InfoFromLabel(label)
                int_counts = [int(c) for c in counts.split()]
                freqs = constructor(list(zip(codon_order, int_counts)),
                                    Info=info)
            except Exception:
                continue
            # The yield is deliberately outside the try block: a bare except
            # around it would swallow GeneratorExit and make close() on this
            # generator fail with RuntimeError.
            yield freqs
    else:
        for rec in CutgFinder(infile):
            try:
                label, counts = rec
            except ValueError:  # wrong number of parts: no counts found
                raise RecordError("Found label without sequences: %s" % rec)
            if not is_cutg_label(label):
                raise RecordError("Found CUTG record without label: %s" % rec)
            info = InfoFromLabel(label)
            # NOTE(review): a non-integer count raises ValueError here, which
            # is not converted to RecordError -- confirm whether intended.
            try:
                int_counts = [int(c) for c in counts.split()]
                freqs = constructor(list(zip(codon_order, int_counts)),
                                    Info=info)
            except NotImplementedError:
                raise RecordError("Unable to convert counts: %s" % counts)
            yield freqs
Beispiel #2
0
def MinimalFastaParser(infile, strict=True,
                       label_to_name=str, finder=FastaFinder,
                       is_label=None, label_characters='>'):
    """Yields (label, seq) tuples for each record found in infile.

    infile: source of lines, handed to finder to be grouped into records.
    strict: if True (default), raises RecordError for a record whose first
        line does not start with a label character, or which has no
        sequence lines. If False, such records are skipped.
    label_to_name: callable applied to the stripped label text (without
        its leading character) to produce the yielded label.
    finder: callable that splits infile into records (lists of lines).
    is_label: accepted but not used by this function.
    label_characters: characters that may start a label line.
    """
    for record in finder(infile):
        header = record[0]

        # a record has to open with a label line
        if header[0] not in label_characters:
            if not strict:
                continue
            raise RecordError(
                "Found Fasta record without label line: %s" % record)

        # ... and carry at least one sequence line after it
        if len(record) < 2:
            if not strict:
                continue
            raise RecordError(
                "Found label line without sequences: %s" % record)

        name = label_to_name(header[1:].strip())
        yield name, ''.join(record[1:])
Beispiel #3
0
def MinimalRfamParser(infile, strict=True, seq_constructor=ChangedRnaSequence):
    """Yield successive records as (header, sequences, structure) tuples.

    infile: source of lines, split into records by RfamFinder.
    strict: if True (default), raises RecordError when a record has no
        sequence or structure lines, or when either part cannot be parsed.
        If False, records with missing or unparsable sequences are
        skipped, and an unparsable structure is yielded as None.
    seq_constructor: passed through to load_from_clustal for the sequences.

    header is a list of stripped header lines.
    sequences is whatever load_from_clustal returns for the sequence lines.
    structure is the entry stored under '#=GC SS_cons' in the result of
        load_from_clustal for the structure lines (or None, see above).
    """
    for record in RfamFinder(infile):
        header = []
        sequences = []
        structure = []
        for line in record:
            if is_header_line(line):
                header.append(line.strip())
            elif is_seq_line(line):
                sequences.append(line)
            elif is_structure_line(line):
                structure.append(line)
            # any other kind of line is ignored

        # sequence and structure are both required
        if not sequences or not structure:
            if not strict:
                continue
            error = 'Found record with missing element(s): '
            if not sequences:
                error += 'sequences '
            if not structure:
                error += 'structure '
            raise RecordError(error)

        # join all sequence parts together
        try:
            sequences = load_from_clustal(sequences,
                                          strict=strict,
                                          seq_constructor=seq_constructor)
        except (DataError, RecordError) as e:
            if strict:
                raise RecordError(str(e))
            continue

        # construct the structure
        try:
            res = load_from_clustal(structure, strict=strict)
            # Explicit check instead of assert: asserts vanish under -O,
            # which would silently accept several structure labels.
            if len(res.NamedSeqs) != 1:
                raise RecordError("Expected exactly one structure label")
            structure = res.NamedSeqs['#=GC SS_cons']
        except (RecordError, KeyError, AssertionError):
            if strict:
                raise RecordError("Can't parse structure of family: %s" %
                                  (str(header)))
            structure = None
        yield header, sequences, structure
Beispiel #4
0
def LabelLineParser(record, splitter, strict=True):
    """Returns dict mapping labels to lists of data, plus list of field order.

    record: iterable of lines.
    splitter: callable applied to each right-stripped line; its result must
        unpack into exactly (key, value).
    strict: if True (default), raises RecordError when a line cannot be
        split into key and value; if False, such lines are skipped.

    Returns (result, labels): result maps each key to the list of its
    values in order of appearance, labels lists the keys in the order they
    were first encountered.

    NOTE: doesn't care if lines are out of order in different blocks. This
    should never happen anyway, but it's possible that this behavior should
    be changed to tighten up validation.
    """
    labels = []
    result = {}
    for line in record:
        try:
            key, val = splitter(line.rstrip())
        except Exception:
            # Exception rather than a bare except, so KeyboardInterrupt and
            # SystemExit are never swallowed in non-strict mode.
            if strict:
                raise RecordError(
                    "Failed to extract key and value from line %s" % line)
            continue  # just skip the line if not strict

        if key not in result:
            result[key] = []
            labels.append(key)
        result[key].append(val)
    return result, labels
Beispiel #5
0
def FastaParser(infile,seq_maker=None,info_maker=MinimalInfo,strict=True):
    """Yields successive sequences from infile as (name, sequence) tuples.

    infile: source of lines, parsed with MinimalFastaParser.
    seq_maker: sequence constructor; defaults to Sequence when None. It is
        called as seq_maker(seq, Name=name, Info=info).
    info_maker: callable taking the label and returning a (name, info)
        pair. The info part is passed on to seq_maker unchanged.
    strict: if True (default), raises RecordError when label or seq is
        missing, or when info_maker or seq_maker raises any Exception.
        If False, records that fail are skipped.
    """
    if seq_maker is None:
        seq_maker = Sequence
    for label, seq in MinimalFastaParser(infile, strict=strict):
        try:
            name, info = info_maker(label)
            sequence = seq_maker(seq, Name=name, Info=info)
        except Exception:
            if strict:
                raise RecordError("Sequence construction failed on record with label %s" % label)
            # not strict: just skip any record that raises an exception
            continue
        # yield outside the try block so that only construction errors are
        # handled above, never exceptions thrown into the generator
        yield name, sequence
0
def GcToInfo(gc_lines, strict=True):
    """Returns a dict constructed from the GC lines.

    gc_lines: iterable of lines holding per-column annotation, expected in
        the form '#=GC <feature> <content>'. Blank lines are ignored.
    strict: if True (default), raises RecordError for a line that does not
        split into three whitespace-separated fields or does not start
        with '#=GC'; if False, such lines are skipped.

    The contents of all lines sharing a feature are concatenated, in input
    order and without separator, into a single string. Feature names are
    translated through _gc_field_names; features missing from that mapping
    keep their original name.
    """
    # collect the content pieces per feature
    initial_info = defaultdict(list)
    for line in gc_lines:
        line = line.strip()
        if not line:
            continue
        fields = line.split(None, 2)
        if len(fields) != 3 or fields[0] != '#=GC':
            if strict:
                raise RecordError("Failed to extract feature and content " +
                    "information from line %s" % (line))
            continue
        feature, content = fields[1:]
        initial_info[feature].append(content.strip())

    # rename the features and join the pieces
    final_info = {}
    for key, parts in initial_info.items():
        final_info[_gc_field_names.get(key, key)] = ''.join(parts)
    return final_info
Beispiel #7
0
def RdbParser(lines, SeqConstructor=RnaSequence, LabelConstructor=InfoMaker, \
    strict=True):
    """Yield sequences from the Rdb record.

    lines: a stream of Rdb records, parsed with MinimalRdbParser.
    SeqConstructor: constructor function to create the final sequence
        object; called as SeqConstructor(clean_seq, Info=info).
    LabelConstructor: function that creates the info object from the
        header lines.
    strict: boolean. When True, an AlphabetError raised by SeqConstructor
        is re-raised as RecordError. When False, a record is skipped if
        SeqConstructor raises any Exception.

    The sequence handed to SeqConstructor is the result of
    create_acceptable_sequence; the raw sequence is stored in the info
    object under 'OriginalSeq'. If the original sequence is the desired
    end product, use MinimalRdbParser.
    """
    for header, sequence in MinimalRdbParser(lines, strict=strict):
        info = LabelConstructor(header)
        clean_seq = create_acceptable_sequence(sequence)
        # add original raw sequence to info
        info['OriginalSeq'] = sequence
        if strict:
            try:
                result = SeqConstructor(clean_seq, Info=info)
            except AlphabetError:
                raise RecordError(\
                "Sequence construction failed on record with reference %s."\
                %(info.Refs))
        else:
            # not strict: just skip any record that raises an exception
            try:
                result = SeqConstructor(clean_seq, Info=info)
            except Exception:
                continue
        # The yield is kept outside the try blocks: a bare except around it
        # would swallow GeneratorExit and break close() on this generator.
        yield result
Beispiel #8
0
def StockholmParser(lines, seq_constructor=Rna, info_constructor_dict=\
    AllToInfo,struct_constructor=WussStructure,strict=True):
    """Yields one alignment object per Stockholm record.

    lines: stream of Stockholm records, parsed with MinimalStockholmParser.
    seq_constructor: passed through to MinimalStockholmParser.
    info_constructor_dict: maps each annotation key to a callable invoked
        as f(annotation_value, strict=strict).
    struct_constructor: callable applied to the raw structure.
    strict: if True (default), raises RecordError when an annotation
        constructor fails or when sequence/structure processing fails.
        If False, records that fail are skipped.

    Each yielded alignment has had _process_seq applied to every entry of
    alignment.Seqs, and its Info updated with the constructed annotations
    plus the constructed structure under the key 'Struct'.
    """
    for annotation, alignment, structure in MinimalStockholmParser(
            lines, strict=strict, seq_constructor=seq_constructor):
        family_info = {}
        if strict:
            for k, v in annotation.items():
                # a key missing from info_constructor_dict raises KeyError
                label_constructor = info_constructor_dict[k]
                try:
                    family_info[k] = label_constructor(v, strict=strict)
                except Exception:
                    raise RecordError("Info construction failed on " +
                        "record on the %s annotation" % (k))
            try:
                for seq in alignment.Seqs:
                    _process_seq(seq, strict)
                structure = struct_constructor(structure)
                alignment.Info.update(family_info)
                alignment.Info.update({'Struct': structure})
            except Exception:
                # .get() on family_info so a record without 'GF' annotation
                # still reports RecordError instead of a masking KeyError
                raise RecordError("Sequence construction failed on " +
                    "record with reference %s" %
                        (family_info.get('GF', {}).get('AccessionNumber',
                                                       None)))
        else:
            try:
                for k, v in annotation.items():
                    label_constructor = info_constructor_dict[k]
                    family_info[k] = label_constructor(v, strict=strict)

                for seq in alignment.Seqs:
                    _process_seq(seq, strict)
                structure = struct_constructor(structure)
                alignment.Info.update(family_info)
                alignment.Info.update({'Struct': structure})
            except Exception:
                continue
        # yield outside the try blocks so only construction errors are
        # handled above, never exceptions thrown into the generator
        yield alignment
Beispiel #9
0
def NameToInfo(sequence, strict=True):
    """Returns an Info object constructed from the sequence Name.

    sequence: object with a Name attribute, expected in the form
        '<genbank id>/<start>-<end>'.
    strict: if True (default), raises RecordError when the name cannot be
        split on '/', or when a non-empty position part cannot be split on
        '-'. If False, the parts that cannot be extracted are set to None.

    The returned Info has the keys 'GenBank', 'Start' and 'End'. Start is
    shifted down by one, since in Python the first position is 0.
    NOTE(review): non-numeric coordinates raise ValueError from int() in
    both modes -- confirm whether that is intended.
    """
    label = sequence.Name
    try:
        gb, pos = label.split('/', 1)  # split genbank label and pos
    except Exception:  # unable to split, so string doesn't contain '/'
        if strict:
            raise RecordError("Failed to extract genbank id and positions" +
                " from label %s" % label)
        gb = None
        pos = None
    else:
        # empty parts count as missing
        gb = gb or None
        pos = pos or None

    start = None
    end = None
    if pos:
        try:
            start, end = pos.split('-', 1)  # split start and end pos
        except Exception:
            if strict:
                raise RecordError("Failed to extract genbank id and positions from label %s"
                    % label)
            start = None
            end = None

    if start:
        # adjust start position to Python's zero-based indexing
        start = int(start) - 1
    if end:
        end = int(end)
    return Info({'GenBank': gb, 'Start': start, 'End': end})
Beispiel #10
0
def RfamParser(lines, seq_constructor=ChangedRnaSequence, label_constructor=\
    HeaderToInfo,struct_constructor=WussStructure,strict=True,verbose=False):
    """Yields (family_info, alignment, structure) for each Rfam record.

    lines: stream of Rfam records, parsed with MinimalRfamParser.
    seq_constructor: passed through to MinimalRfamParser.
    label_constructor: callable invoked as f(header, strict=strict) to
        build family_info from the header lines.
    struct_constructor: callable applied to the raw structure.
    strict: if True (default), raises RecordError when info construction
        or sequence/structure processing fails. If False, failing records
        are skipped.
    verbose: in non-strict mode only, print the exception that caused a
        record to be skipped.

    _process_seq is applied to every entry of alignment.Seqs before the
    record is yielded.
    """
    for header, alignment, structure in MinimalRfamParser(
            lines, strict=strict, seq_constructor=seq_constructor):
        if strict:
            try:
                family_info = label_constructor(header, strict=strict)
            except Exception:
                raise RecordError("Info construction failed on " +
                    "record with header %s" % header)
            try:
                for seq in alignment.Seqs:
                    _process_seq(seq, strict)
                structure = struct_constructor(structure)
            except Exception:
                raise RecordError("Sequence construction failed on " +
                    "record with reference %s" % (family_info.Refs))
        else:
            try:
                family_info = label_constructor(header, strict=strict)
                for seq in alignment.Seqs:
                    _process_seq(seq, strict)
                structure = struct_constructor(structure)
            except Exception as e:
                if verbose:
                    print(Exception, e)
                continue
        # yield outside the try blocks so only construction errors are
        # handled above, never exceptions thrown into the generator
        yield family_info, alignment, structure
Beispiel #11
0
def NcbiFastaLabelParser(line):
    """Builds an Info object from an NCBI-style fasta label line.

    The line is split on '|' into at most five fields: an ignored leading
    field, the GI, the database tag, the database reference and the
    description. Raises RecordError if that unpacking fails.

    Returns (gi, info), where info carries GI, Description and the
    database reference stored under NcbiLabels[db].
    """
    record_info = Info()
    try:
        pieces = [strip(piece) for piece in line.split('|', 4)]
        _, gi, db, db_ref, description = pieces
    except ValueError:  # wrong number of fields
        raise RecordError("Unable to parse label line %s" % line)
    record_info.GI = gi
    record_info[NcbiLabels[db]] = db_ref
    record_info.Description = description
    return gi, record_info
Beispiel #12
0
 def parser(lines):
     """Yield lists of exactly num consecutive kept lines.

     Each line is passed through constructor (when one is set) and dropped
     if ignore() accepts it. constructor, ignore and num come from the
     enclosing scope. Raises RecordError if kept lines remain at the end
     that do not fill a complete group.
     """
     batch = []
     for raw in lines:
         item = constructor(raw) if constructor else raw
         if ignore(item):
             continue
         batch.append(item)
         if len(batch) == num:
             yield batch
             batch = []
     if batch:
         raise RecordError("Non-blank lines not even multiple of %s" % num)
Beispiel #13
0
def CutgSpeciesParser(infile, strict=True, constructor=CodonUsage):
    """Yields successive records from infile as CodonUsage objects.

    infile: source of lines; it is handed to CutgSpeciesFinder, which
        groups the lines into records.
    strict: if True (default), raises RecordError when a record cannot be
        unpacked into (label, counts), when the label is rejected by
        is_cutg_species_label, or when the counts cannot be converted.
        If False, any record that fails to parse is silently skipped.
    constructor: callable invoked as constructor(pairs, Info=info), where
        pairs is a list of (codon, count) tuples built by zipping
        codon_order with the integer counts.

    The Info attached to each result has the keys 'Species' and 'NumGenes'
    taken from species_label_splitter(label).
    """
    if not strict:
        # best-effort mode: drop every record that cannot be converted
        for rec in CutgSpeciesFinder(infile):
            try:
                label, counts = rec
                if not is_cutg_species_label(label):
                    continue
                species, genes = species_label_splitter(label)
                info = Info({'Species': species, 'NumGenes': int(genes)})
                int_counts = [int(c) for c in counts.split()]
                freqs = constructor(list(zip(codon_order, int_counts)),
                                    Info=info)
            except Exception:
                continue
            # The yield is deliberately outside the try block: a bare except
            # around it would swallow GeneratorExit and make close() on this
            # generator fail with RuntimeError.
            yield freqs
    else:
        for rec in CutgSpeciesFinder(infile):
            try:
                label, counts = rec
            except ValueError:  # wrong number of parts: no counts found
                raise RecordError("Found label without sequences: %s" % rec)

            if not is_cutg_species_label(label):
                raise RecordError("Found CUTG record without label: %s" % rec)
            species, genes = species_label_splitter(label)
            info = Info({'Species': species, 'NumGenes': int(genes)})
            try:
                int_counts = [int(c) for c in counts.split()]
                freqs = constructor(list(zip(codon_order, int_counts)),
                                    Info=info)
            except Exception:
                raise RecordError("Unable to convert counts: %s" % counts)
            yield freqs
Beispiel #14
0
def verify_valid_fasta_format(input_fasta_fp):
    """Tests fasta filepath to determine if valid format.

    input_fasta_fp:  fasta filepath

    Reads the whole file through MinimalFastaParser. Returns None when all
    records parse; raises RecordError otherwise. The error message reports
    the last label and sequence that parsed successfully (None for both
    if the very first record is invalid).
    """
    # Pre-set so the error message can be built even if the first record
    # fails, when the loop variables would otherwise be unbound.
    label = None
    seq = None

    # Plain text mode: the "U" flag was removed in Python 3.11, and
    # universal newlines are already the default in Python 3. The with
    # block closes the file on the error path too.
    with open(input_fasta_fp) as fasta_f:
        try:
            for label, seq in MinimalFastaParser(fasta_f):
                pass
        except RecordError:
            raise RecordError("Input fasta file not valid fasta format.  Error " +
                              "found at %s label and %s sequence " % (label, seq))
Beispiel #15
0
def InfoFromLabel(line):
    """Takes a CUTG codon description line and returns an Info object.

    The line is split on backslashes. The first field holds the locus
    (with a leading character that is dropped) and, optionally after '#',
    the CDS number, which defaults to '1'. The remaining fields are paired
    with field_order. The 'Description' field is then split on '/' and
    each key=value piece is stored on the result; db_xref values of the
    form 'database:reference' are stored under the database name.

    Raises RecordError if any step fails (wrong number of fields etc.).
    """
    try:
        raw_fields = line.split('\\')
        result = Info(
            dict(zip(field_order, [strip(f) for f in raw_fields[1:]])))
        # extra processing for first field
        first = raw_fields[0]
        if '#' in first:
            locus, cds_num = [strip(part) for part in first.split('#')]
        else:
            locus, cds_num = first, '1'
        result['Locus'] = locus[1:]  # remove leading '>'
        result['CdsNumber'] = cds_num
        # additional processing for last field: mostly key="value" pairs
        for d in result['Description'].split('/'):
            if '=' not in d:  # not a key-value pair
                continue
            # split once only: there might be '=' in the value
            key, val = [strip(part) for part in d.split('=', 1)]
            # cut off leading and trailing " if present, but _not_ internal!
            if val.startswith('"'):
                val = val[1:]
            if val.endswith('"'):
                val = val[:-1]
            if key != 'db_xref':
                # remember to convert the key to MixedCase naming convention
                result[cfu(key)] = val
                continue
            # handle cross-refs specially
            try:
                key, val = val.split(':')
            except ValueError:  # missing actual reference?
                continue  # just skip the bad db records
            try:
                if result[key]:
                    result[key].append(val)
                else:
                    result[key] = [val]
            except (KeyError, TypeError):  # didn't recognize database
                result[key] = val
        return result
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not converted into RecordError.
        raise RecordError("Failed to read label line:\n%s" % line)
Beispiel #16
0
def HeaderToInfo(header, strict=True):
    """Returns an Info object constructed from the header lines.

    header: iterable of lines, expected in the form
        '#=GF <two-letter label> <content>' with single-space separators.
        Blank lines are ignored.
    strict: if True (default), raises RecordError for a line that does not
        split into three fields, does not start with '#=GF', or whose
        label is not two characters long; if False, such lines are skipped.

    The labels BM, DR, RM and CC may occur several times and are collected
    in a list; for any other label the last occurrence wins. Labels are
    renamed through _field_names (unknown labels keep their name), and the
    values of the field renamed to 'Comment' are joined with ' '.
    """
    # collect the raw content per label
    initial_info = {}
    for line in header:
        line = line.strip()
        if not line:
            continue
        fields = line.split(' ', 2)
        if (len(fields) != 3 or fields[0] != '#=GF'
                or len(fields[1]) != 2):
            if strict:
                raise RecordError("Failed to extract label and content " +
                    "information from line %s" % (line))
            continue
        label = fields[1]
        content = fields[2].strip()
        if label in ('BM', 'DR', 'RM', 'CC'):
            initial_info.setdefault(label, []).append(content)
        else:
            initial_info[label] = content

    # rename the labels; comments become a single string
    final_info = {}
    for key, value in initial_info.items():
        name = _field_names.get(key, key)
        if name == 'Comment':
            value = ' '.join(value)
        final_info[name] = value

    return Info(final_info)
Beispiel #17
0
def GfToInfo(gf_lines, strict=True):
    """Returns a dict constructed from the GF lines.

    gf_lines: iterable of lines holding per-file annotation, expected in
        the form '#=GF <feature> <content>'. Blank lines are ignored.
    strict: if True (default), raises RecordError for a line that does not
        split into three whitespace-separated fields or does not start
        with '#=GF'; if False, such lines are skipped.

    The features BM, DR, RM, CC and FT may occur several times and are
    collected in a list; for any other feature the last occurrence wins.
    Features are renamed through _gf_field_names (unknown features keep
    their name), and the values of the field renamed to 'Comment' are
    joined with ' '.
    """
    # collect the raw content per feature
    initial_info = {}
    for line in gf_lines:
        line = line.strip()
        if not line:
            continue
        fields = line.split(None, 2)
        if len(fields) != 3 or fields[0] != '#=GF':
            if strict:
                raise RecordError("Failed to extract feature and content " +
                    "information from line %s" % (line))
            continue
        feature = fields[1]
        content = fields[2].strip()
        if feature in ('BM', 'DR', 'RM', 'CC', 'FT'):
            initial_info.setdefault(feature, []).append(content)
        else:
            initial_info[feature] = content

    # rename the features; comments become a single string
    final_info = {}
    for key, value in initial_info.items():
        name = _gf_field_names.get(key, key)
        if name == 'Comment':
            value = ' '.join(value)
        final_info[name] = value

    return final_info
Beispiel #18
0
def GsToInfo(gs_lines, strict=True):
    """Returns a dict constructed from the GS lines.

    gs_lines: iterable of lines holding per-sequence annotation, expected
        in the form '#=GS <seqname> <feature> <content>'. Blank lines are
        ignored.
    strict: if True (default), raises RecordError for a line that does not
        split into four whitespace-separated fields or does not start with
        '#=GS'; if False, such lines are skipped.

    The result maps each feature (renamed through _gs_field_names, unknown
    features keep their name) to a dict keyed by sequence name. For the
    features DE, DR and BP the per-sequence value is a list of all
    contents; for any other feature the last occurrence wins.
    """
    # collect the raw content per feature and sequence name
    initial_info = {}
    for line in gs_lines:
        line = line.strip()
        if not line:
            continue
        fields = line.split(None, 3)
        if len(fields) != 4 or fields[0] != '#=GS':
            if strict:
                raise RecordError("Failed to extract feature and content " +
                    "information from line %s" % (line))
            continue
        seqname, feature = fields[1], fields[2]
        content = fields[3].strip()
        per_seq = initial_info.setdefault(feature, {})
        if feature in ('DE', 'DR', 'BP'):
            # setdefault on the sequence name as well: a feature already
            # seen for one sequence must not raise KeyError for the next
            per_seq.setdefault(seqname, []).append(content)
        else:
            per_seq[seqname] = content

    # rename the features
    final_info = {}
    for key, value in initial_info.items():
        final_info[_gs_field_names.get(key, key)] = value

    return final_info
Beispiel #19
0
def GrToInfo(gr_lines, strict=True):
    """Returns a dict constructed from the GR lines.

    gr_lines: iterable of lines holding per-sequence AND per-column
        annotation, expected in the form
        '#=GR <seqname> <feature> <content>'. Blank lines are ignored.
    strict: if True (default), raises RecordError for a line that does not
        split into four whitespace-separated fields or does not start with
        '#=GR'; if False, such lines are skipped.

    The result maps each feature (renamed through _gr_field_names, unknown
    features keep their name) to a dict keyed by sequence name, whose
    values are the contents of all matching lines concatenated in input
    order without separator.
    """
    # collect the content pieces per feature and sequence name
    initial_info = defaultdict(dict)
    for line in gr_lines:
        line = line.strip()
        if not line:
            continue
        fields = line.split(None, 3)
        if len(fields) != 4 or fields[0] != '#=GR':
            if strict:
                raise RecordError("Failed to extract feature and content " +
                    "information from line %s" % (line))
            continue
        seqname, feature, content = fields[1:]
        initial_info[feature].setdefault(seqname, []).append(content.strip())

    # rename the features and join the pieces
    final_info = {}
    for feature, per_seq in initial_info.items():
        name = _gr_field_names.get(feature, feature)
        final_info[name] = {seqname: ''.join(parts)
                            for seqname, parts in per_seq.items()}

    return final_info
Beispiel #20
0
    def parser(lines):
        """Yield lists of lines, each closed by a line is_tail_line accepts.

        Every line is passed through constructor (when one is set) and
        dropped if ignore() accepts it. Lines left over after the last tail
        line raise RecordError in strict mode and are yielded as a final
        record otherwise. constructor, ignore, is_tail_line and strict come
        from the enclosing scope.
        """
        record = []
        for item in lines:
            if constructor:
                item = constructor(item)
            if ignore(item):
                continue

            record.append(item)
            # a tail line completes the record collected so far
            if is_tail_line(item):
                yield record
                record = []

        # anything still collected was never closed by a tail line
        if not record:
            return
        if strict:
            raise RecordError('lines exist after the last tail_line '
                              'or no tail_line at all')
        yield record
Beispiel #21
0
 def parser(lines):
     """Yield lists of lines, split wherever a line equals delimiter.

     Every line is passed through constructor (when one is set) and
     dropped if ignore() accepts it. The delimiter line itself is included
     in the record only when keep_delimiter is set. Lines left over after
     the last delimiter raise RecordError in strict mode and are yielded
     as a final record otherwise. constructor, ignore, delimiter,
     keep_delimiter and strict come from the enclosing scope.
     """
     pending = []
     for line in lines:
         if constructor:
             line = constructor(line)
         # skip lines the ignore function rejects
         if ignore(line):
             continue
         # ordinary line: keep collecting
         if not line == delimiter:
             pending.append(line)
             continue
         # delimiter reached: hand out what was collected
         if keep_delimiter:
             pending.append(line)
         yield pending
         pending = []
     if not pending:
         return
     if strict:
         raise RecordError("Found additional data after records: %s"%\
                 (pending))
     yield pending
Beispiel #22
0
def check_tree_info(tree_info):
    """Raises RecordError if tree_info is empty; returns None otherwise."""
    if not tree_info:
        raise RecordError("not a valid Nexus Tree File")
Beispiel #23
0
 def dnastrict(x, **kwargs):
     """Return DnaSequence(x, **kwargs); any Exception becomes RecordError."""
     try:
         converted = DnaSequence(x, **kwargs)
     except Exception:
         raise RecordError("Could not convert sequence")
     return converted
Beispiel #24
0
def MinimalPhylipParser(data, id_map=None, interleaved=True):
    """Yields successive sequences from data as (label, seq) tuples.

    NOTE: when the data is handled as interleaved, every sequence part is
    cached in memory and nothing is yielded until all lines have been
    read. Otherwise each sequence is yielded as soon as the next label
    (or the end of the data) is reached.

    data: sequence of lines in phylip format (an open file, list, etc).
        The first line is treated as the header and passed to
        _get_header_info.
    id_map: optional id mapping from external ids to phylip labels.
        Not implemented: the argument is never read in this function.
    interleaved: only used if data yields no lines at all; otherwise it is
        overwritten by the value _get_header_info returns for the header.

    Raises RecordError if, in interleaved mode, a joined sequence does not
    have the length given in the header.

    returns (id, sequence) tuples
    """
    
    # starts as a dict (index -> list of parts, interleaved mode) but is
    # rebound to a list [id, part, part, ...] in non-interleaved mode
    seq_cache = {}
    interleaved_id_map = {}
    # passed to _split_line as the offset separating the id from the
    # sequence -- presumably the width of the id column, TODO confirm
    id_offset = 10
    # -1 marks "header not read yet"; afterwards counts non-blank data lines
    curr_ct = -1 

    for line in data:
        if curr_ct == -1:
            # get header info
            num_seqs, seq_len, interleaved = _get_header_info(line)
          
            # stop without yielding anything if either count is missing/zero
            if not num_seqs or not seq_len:
                return 
            curr_ct += 1
            continue

        curr_id, curr_seq = _split_line(line, id_offset)

        # skip blank lines (they do not advance curr_ct)
        if not curr_id and not curr_seq:
            continue

        if not interleaved:
            if curr_id:
                # a new id starts: emit the sequence collected so far
                if seq_cache:
                    yield seq_cache[0], ''.join(seq_cache[1:])
                seq_cache = [curr_id, curr_seq]
            else:
                # continuation line of the current sequence
                # NOTE(review): if the first data line has no id, seq_cache
                # is still a dict here and append() fails -- confirm whether
                # _split_line can return an empty id on that line
                seq_cache.append(curr_seq)
        else:
            # position of this line within its block of num_seqs lines
            curr_id_ix = curr_ct % num_seqs

            # once the last line of a block has been reached, later lines
            # are split with offset 0
            if (curr_ct + 1) % num_seqs == 0:
                id_offset = 0

            # first time this position is seen: remember its id
            if curr_id_ix not in interleaved_id_map:
                interleaved_id_map[curr_id_ix] = curr_id
                seq_cache[curr_id_ix] = []

            seq_cache[curr_id_ix].append(curr_seq)
        curr_ct += 1


    # return joined sequences if interleaved
    if interleaved:
        for curr_id_ix, seq_parts in seq_cache.items():
            join_seq = ''.join(seq_parts)

            # the joined length has to match the header
            if len(join_seq) != seq_len:
                raise RecordError(
                    "Length of sequence '%s' is not the same as in header "
                    "Found %d, Expected %d" % (
                    interleaved_id_map[curr_id_ix], len(join_seq), seq_len))

            yield interleaved_id_map[curr_id_ix], join_seq
    #return last seq if not interleaved (no length check in this mode)
    else:
        if seq_cache:
            yield seq_cache[0], ''.join(seq_cache[1:])
Beispiel #25
0
 def dnastrict(x, **kwargs):
     """Return Dna(x, check=True, **kwargs); any Exception becomes RecordError."""
     try:
         converted = Dna(x, check=True, **kwargs)
     except Exception:
         raise RecordError("Could not convert sequence")
     return converted
Beispiel #26
0
def DndParser(lines, constructor=PhyloNode, unescape_name=False):
    """Returns tree from the Clustal .dnd file format, and anything equivalent.

    Tree is made up of cogent.base.tree.PhyloNode objects, with branch lengths
    (by default, although you can pass in an alternative constructor
    explicitly).

    Arguments:
        lines: the tree text, either a single string or an iterable of
            strings, which are concatenated without any separator.
        constructor: callable used to create nodes. It is called with no
            arguments for an empty tree and is otherwise handed to
            _new_child for every node that gets created.
        unescape_name: if True, surrounding single quotes are stripped from
            node names. For names that create a new node, an unquoted name
            additionally has its underscores replaced by spaces; names
            assigned after a closing paren only have quotes stripped.

    Returns the node the parser finishes on (which must have no Parent), or
    a fresh constructor() node if no node was ever created.

    Raises RecordError if the numbers of '(' and ')' differ, if a token is
    met in a state no branch handles, or if parsing does not end on a node
    without a Parent. A branch length that float() cannot convert
    propagates float()'s ValueError unchanged.
    """
    if isinstance(lines, str):
        data = lines
    else:
        data = ''.join(lines)
    #skip arb comment stuff if present: start at first paren
    # NOTE(review): if there is no '(' at all, find() returns -1 and the slice
    # below keeps only the last character of data -- confirm this is intended.
    paren_index = data.find('(')
    data = data[paren_index:]
    # cheap sanity check on nesting before tokenizing
    left_count = data.count('(')
    right_count = data.count(')')
    if left_count != right_count:
        raise RecordError("Found %s left parens but %s right parens." % \
            (left_count, right_count))

    tokens = DndTokenizer(data)
    curr_node = None
    # state:  'PostColon' while the next token is expected to be a branch
    #         length, 'PreColon' otherwise.
    # state1: 'PostClosed' right after a ')' (the next name belongs to the
    #         node that was just closed), 'PreClosed' otherwise.
    state = 'PreColon'
    state1 = 'PreClosed'
    last_token = None
    for t in tokens:
        # The first three branches end in `continue` on purpose: they must
        # skip the state reset at the bottom of the loop.
        if t == ':':  #expecting branch length
            state = 'PostColon'
            #prevent state reset
            last_token = t
            continue
        if t == ')' and (last_token == ','
                         or last_token == '('):  # node without name
            # an empty slot before ')': create an unnamed child, then step
            # to that child's Parent
            new_node = _new_child(curr_node, constructor)
            new_node.Name = None
            curr_node = new_node.Parent
            state1 = 'PostClosed'
            last_token = t
            continue
        if t == ')':  #closing the current node
            curr_node = curr_node.Parent
            state1 = 'PostClosed'
            last_token = t
            continue
        if t == '(':  #opening a new node
            curr_node = _new_child(curr_node, constructor)
        elif t == ';':  #end of data
            last_token = t
            break
        # node without name
        elif t == ',' and (last_token == ',' or last_token == '('):
            new_node = _new_child(curr_node, constructor)
            new_node.Name = None
            curr_node = new_node.Parent
        elif t == ',':  #separator: next node adds to this node's parent
            curr_node = curr_node.Parent
        elif state == 'PreColon' and state1 == 'PreClosed':  #data for the current node
            # a name that is not preceded by ')' creates a new node
            new_node = _new_child(curr_node, constructor)
            if unescape_name:
                if t.startswith("'") and t.endswith("'"):
                    # strip every layer of surrounding single quotes
                    while t.startswith("'") and t.endswith("'"):
                        t = t[1:-1]
                else:
                    # unquoted names: underscores stand for spaces
                    if '_' in t:
                        t = t.replace('_', ' ')
            new_node.Name = t
            curr_node = new_node
        elif state == 'PreColon' and state1 == 'PostClosed':
            # a name directly after ')' labels the node that was just closed;
            # only quotes are stripped here, underscores are left alone
            if unescape_name:
                while t.startswith("'") and t.endswith("'"):
                    t = t[1:-1]
            curr_node.Name = t
        elif state == 'PostColon':  #length data for the current node
            curr_node.Length = float(t)
        else:  #can't think of a reason to get here
            raise RecordError("Incorrect PhyloNode state? %s" % t)
        state = 'PreColon'  #get here for any non-colon token
        state1 = 'PreClosed'
        last_token = t

    # a well-formed tree leaves the parser on a node that has no Parent
    if curr_node is not None and curr_node.Parent is not None:
        raise RecordError("Didn't get back to root of tree.")

    if curr_node is None:  #no data -- return empty node
        return constructor()
    return curr_node  #this should be the root of the tree
Beispiel #27
0
def MinimalStockholmParser(infile, strict=True, seq_constructor=Rna):
    """Yield successive records as (headers, sequences, structure).

    headers is a dict with the keys 'GF', 'GC', 'GS' and 'GR'; each value is
        the list of stripped lines of that kind found in the record.
    sequences is the result of load_from_clustal on the record's sequence
        lines, built with seq_constructor (presumably an Alignment keyed by
        the original labels -- see load_from_clustal).
    structure is the '#=GC SS_cons' entry parsed from the record's
        structure lines. It is an empty list when the record has no
        structure lines, and None when strict is False and the structure
        could not be parsed.

    Arguments:
        infile: source of Stockholm data, split into records by
            StockholmFinder.
        strict: if True (default), raise RecordError for a record without
            sequence lines, with unparseable sequences, or with an
            unparseable structure. If False, records with missing or bad
            sequences are skipped silently and a bad structure becomes None.
        seq_constructor: passed through to load_from_clustal for the
            sequence lines.
    """
    for record in StockholmFinder(infile):
        gf = []
        gc = []
        gs = []
        gr = []
        sequences = []
        structure = []
        # Sort each line into its bucket; lines matching none are ignored.
        for line in record:
            if is_gf_line(line):
                gf.append(line.strip())
            elif is_gc_line(line):
                gc.append(line.strip())
                # structure lines are a subset of GC lines; keep them raw
                # so they can be parsed like sequence lines below
                if is_structure_line(line):
                    structure.append(line)
            elif is_gs_line(line):
                gs.append(line.strip())
            elif is_gr_line(line):
                gr.append(line.strip())
            elif is_seq_line(line):
                sequences.append(line)

        # Only sequences are required; a record may lack a structure
        # (for example when looking at the stockholm format of one family).
        if not sequences:
            if strict:
                raise RecordError(
                    'Found record with missing element(s): sequences')
            continue

        #join all sequence parts together, construct label
        try:
            sequences = load_from_clustal(sequences,
                                          strict=strict,
                                          seq_constructor=seq_constructor)
        except (DataError, RecordError) as e:
            if strict:
                raise RecordError(str(e))
            continue

        #construct the structure
        if structure:
            try:
                res = load_from_clustal(structure, strict=strict, gap_char='.')
                # Explicit check instead of `assert`, which is stripped
                # under `python -O` and would let a record with several
                # structure keys through unnoticed.
                if len(res.NamedSeqs) != 1:
                    raise RecordError(
                        "Expected exactly one structure, found %d" %
                        len(res.NamedSeqs))
                structure = res.NamedSeqs['#=GC SS_cons']
            except (RecordError, KeyError, AssertionError):
                if strict:
                    raise RecordError("Can't parse structure of family")
                structure = None
        yield {'GF': gf, 'GC': gc, 'GS': gs, 'GR': gr}, sequences, structure