Esempio n. 1
0
def MinimalRdbParser(infile, strict=True):
    """Yield successive sequences as (headerLines, sequence) tuples.

    infile: stream of lines grouped into Rdb records by RdbFinder.
    strict: if True (default) raises RecordError when the 'seq' label is
        missing or when the record doesn't contain any sequence; if False,
        such records are silently skipped.
    """
    for rec in RdbFinder(infile):
        index = None
        # Track the position directly with enumerate: the original used
        # rec.index(line), which returns the FIRST occurrence of that text
        # and silently miscounts when identical lines repeat in a record.
        for i, line in enumerate(rec):
            if is_seq_label(line):
                index = i + 1  # index of first sequence line

        # if there is no line that starts with 'seq:' throw error or skip
        if index is None:
            if strict:
                raise RecordError(
                    "Found Rdb record without seq label " + "line: %s" % rec[0]
                )
            else:
                continue

        headerLines = rec[:index]
        sequence = "".join(rec[index:-1])  # strip off the delimiter
        if sequence.endswith("*"):
            sequence = sequence[:-1]  # strip off '*'

        # if there are no sequences throw error or skip
        if not sequence:
            if strict:
                raise RecordError("Found Rdb record without sequences: %s" % rec[0])
            else:
                continue

        yield headerLines, sequence
Esempio n. 2
0
def MinimalFastaParser(
    infile, strict=True, label_to_name=str, finder=FastaFinder, label_characters=">"
):
    """Yields successive sequences from infile as (label, seq) tuples.

    infile: a path openable by open_, or an already-open iterable of lines.
    strict: if True (default), raises RecordError when label or seq missing;
        otherwise malformed records are skipped.
    label_to_name: callable applied to the stripped label text.
    finder: record finder partitioning the line stream into records.
    label_characters: characters that may start a label line.
    """
    try:
        infile = open_(infile)
        close_at_end = True
    except (TypeError, AttributeError):
        # already an open file / iterable of lines; the caller owns it
        close_at_end = False

    try:
        for rec in finder(infile):
            # first line must be a label line
            if rec[0][0] not in label_characters:
                if strict:
                    raise RecordError("Found Fasta record without label line: %s" % rec)
                continue
            # record must have at least one sequence
            if len(rec) < 2:
                if strict:
                    raise RecordError("Found label line without sequences: %s" % rec)
                continue

            label = rec[0][1:].strip()
            label = label_to_name(label)
            seq = "".join(rec[1:])

            yield label, seq
    finally:
        # close the handle we opened even when a RecordError is raised;
        # the original only closed after a clean loop exit and leaked
        # the file on error
        if close_at_end:
            infile.close()
Esempio n. 3
0
def FastaParser(infile, seq_maker=None, info_maker=MinimalInfo, strict=True):
    """Yields successive sequences from infile as (name, sequence) tuples.

    Constructs the sequence using seq_maker(seq, info=Info(info_maker(label))).

    If strict is True (default), raises RecordError when label or seq missing.
    Also raises RecordError if seq_maker fails.

    It is info_maker's responsibility to raise the appropriate RecordError or
    FieldError on failure.

    Result of info_maker need not actually be an info object, but can just be
    a dict or other data that Info can use in its constructor.
    """
    if seq_maker is None:
        seq_maker = Sequence
    for label, seq in MinimalFastaParser(infile, strict=strict):
        try:
            name, info = info_maker(label)  # may raise on a malformed label
            result = name, seq_maker(seq, name=name, info=info)
        except Exception as e:
            if strict:
                # Chain the cause (the original discarded it). The original
                # also kept the yield inside the try, which could convert
                # exceptions thrown INTO the generator into RecordError.
                raise RecordError(
                    "Sequence construction failed on record with label %s" %
                    label) from e
            # not strict: just skip any record that raises an exception
            continue
        yield result
Esempio n. 4
0
def LabelLineParser(record, splitter, strict=True):
    """Returns dict mapping list of data to labels, plus list with field order.

    record: iterable of lines; each is rstripped and passed to splitter.
    splitter: callable returning a (key, value) pair for a line; any
        exception it raises marks the line as unparseable.
    strict: if True (default), an unparseable line raises RecordError;
        otherwise the line is skipped.

    Field order contains labels in order encountered in file.

    NOTE: doesn't care if lines are out of order in different blocks. This
    should never happen anyway, but it's possible that this behavior should
    be changed to tighten up validation.
    """
    labels = []
    result = {}
    for line in record:
        try:
            key, val = splitter(line.rstrip())
        except Exception as err:
            # narrowed from a bare except: (which also swallowed
            # KeyboardInterrupt/SystemExit); chain the real cause
            if strict:
                raise RecordError(
                    "Failed to extract key and value from line %s" % line) from err
            else:
                continue  # just skip the line if not strict

        if key in result:
            result[key].append(val)
        else:
            result[key] = [val]
            labels.append(key)
    return result, labels
Esempio n. 5
0
def MinimalNexusAlignParser(align_path):
    """Yields (label, seq) pairs from a nexus DATA/CHARACTERS block.

    align_path: a file path (str), an open file, or a list of lines.

    Raises ValueError if the input is not a nexus file or has no data
    block; raises RecordError if the block lacks a 'matrix' line.
    """
    # isinstance instead of type(...) == str: also accepts str subclasses
    if isinstance(align_path, str):
        infile = open_(align_path)
    else:
        infile = align_path

    isblock = re.compile(r"begin\s+(data|characters)").search
    inblock = False
    try:
        line = infile.readline()
    except AttributeError:
        # guessing it's a list of strings from a nexus file
        line = infile.pop(0)

    if not line.lower().startswith("#nexus"):
        raise ValueError("not a nexus file")

    block = []
    index = None
    for line in infile:
        if isblock(line.lower()):
            inblock = True
        elif inblock and line.lower().startswith("end;"):
            break
        elif inblock:
            line = line.strip()
            if line.lower().startswith("matrix"):
                # sequence rows start after the 'matrix' keyword
                index = len(block)
            elif not line.startswith(";"):
                block.append(line)

    if hasattr(infile, "close"):
        infile.close()

    if not block:
        raise ValueError("not found DATA or CHARACTER block")
    elif index is None:
        raise RecordError("malformed block, no 'matrix' line")

    block = block[index:]
    seqs = defaultdict(list)
    for line in block:
        if not line or (line.startswith("[") and line.endswith("]")):
            # blank or comment line
            continue

        line = line.split()
        seqs[line[0]].append("".join(line[1:]))

    for n, s in seqs.items():
        yield n, "".join(s)
Esempio n. 6
0
def NcbiFastaLabelParser(line):
    """Creates an Info object and populates it with the line contents.

    line: an NCBI-style fasta label with pipe-separated fields,
        e.g. 'gi|12345|gb|AB000001|some description'.

    Returns (gi, info); raises RecordError when the line does not split
    into the expected five fields.

    As of 11/12/03, all records in genpept.fsa and the human RefSeq fasta
    files were consistent with this format.
    """
    info = Info()
    try:
        ignore, gi, db, db_ref, description = list(map(strip, line.split("|", 4)))
    except ValueError as err:  # wrong number of pipe-separated fields
        # chain the cause so the original failure is preserved
        raise RecordError("Unable to parse label line %s" % line) from err
    info.GI = gi
    info[NcbiLabels[db]] = db_ref
    info.Description = description
    return gi, info
Esempio n. 7
0
 def parser(lines):
     """Group accepted lines into successive chunks of exactly ``num`` items.

     Each raw line is optionally transformed by ``constructor`` and dropped
     when ``ignore`` accepts it. Raises RecordError when leftover lines do
     not fill a final chunk.
     """
     chunk = []
     for raw in lines:
         item = constructor(raw) if constructor else raw
         if ignore(item):
             continue
         chunk.append(item)
         if len(chunk) == num:
             yield chunk
             chunk = []
     if chunk:
         raise RecordError("Non-blank lines not even multiple of %s" % num)
Esempio n. 8
0
    def parser(lines):
        """Yield records delimited by tail lines.

        Lines are optionally transformed by ``constructor`` and skipped when
        ``ignore`` accepts them; a record is emitted whenever a tail line is
        seen. Trailing lines with no tail line raise RecordError when strict,
        otherwise they are yielded as a final record.
        """
        record = []
        for raw in lines:
            item = constructor(raw) if constructor else raw
            if ignore(item):
                continue

            record.append(item)
            # a tail line terminates the record being accumulated
            if is_tail_line(item):
                yield record
                record = []

        # anything left over never saw a tail line
        if record:
            if strict:
                raise RecordError("lines exist after the last tail_line "
                                  "or no tail_line at all")
            yield record
Esempio n. 9
0
 def parser(lines):
     """Yield groups of lines separated by ``delimiter``.

     Lines are optionally transformed by ``constructor`` and skipped when
     ``ignore`` accepts them. The delimiter line itself is included in the
     emitted group only when ``keep_delimiter`` is true. Data remaining
     after the last delimiter raises RecordError when strict, otherwise it
     is yielded as a final group.
     """
     group = []
     for raw in lines:
         item = constructor(raw) if constructor else raw
         if ignore(item):
             continue
         if item != delimiter:
             group.append(item)
             continue
         # hit the delimiter: emit the group and start a fresh one
         if keep_delimiter:
             group.append(item)
         yield group
         group = []
     if group:
         if strict:
             raise RecordError("Found additional data after records: %s" % (group))
         yield group
Esempio n. 10
0
def RdbParser(
    lines, SeqConstructor=RnaSequence, LabelConstructor=InfoMaker, strict=True
):
    """Yield sequences from the Rdb record.

    lines: a stream of Rdb records.
    SeqConstructor: constructor function to create the final sequence object
    LabelConstructor: function that creates Info dictionary from label lines
    strict: boolean, when True, an error is raised when one occurs, when False,
        the record is ignored when an error occurs.

    This function returns proper RnaSequence objects when possible. It strips
    out the secondary structure information, and it replaces 'o' by '?'. The
    original sequence is stored in the info dictionary under 'OriginalSeq'.
    If the original sequence is the desired end product, use MinimalRdbParser.
    """
    for header, sequence in MinimalRdbParser(lines, strict=strict):
        info = LabelConstructor(header)
        clean_seq = create_acceptable_sequence(sequence)
        # add original raw sequence to info
        info["OriginalSeq"] = sequence
        if strict:
            # need to do error checking while constructing info and sequence
            try:
                yield SeqConstructor(clean_seq, info=info)
            except AlphabetError as err:
                # chain the cause so the alphabet failure is preserved
                raise RecordError(
                    "Sequence construction failed on record with reference %s."
                    % (info.Refs)
                ) from err
        else:
            # not strict: just skip any record that raises an exception
            try:
                yield SeqConstructor(clean_seq, info=info)
            except Exception:
                # narrowed from bare except:, which also swallowed
                # KeyboardInterrupt/SystemExit
                continue
Esempio n. 11
0
def DndParser(lines, constructor=PhyloNode, unescape_name=False):
    """Returns tree from the Clustal .dnd file format, and anything equivalent.

    Tree is made up of cogent3.base.tree.PhyloNode objects, with branch lengths
    (by default, although you can pass in an alternative constructor
    explicitly).

    lines: a newick string, or a sequence of lines joined into one.
    constructor: factory called to create each tree node.
    unescape_name: if True, strips enclosing single quotes from quoted
        names and replaces underscores with spaces in unquoted names.

    Raises RecordError on unbalanced parentheses or if parsing does not
    end at the root of the tree.
    """
    if isinstance(lines, str):
        data = lines
    else:
        data = "".join(lines)
    # skip arb comment stuff if present: start at first paren
    paren_index = data.find("(")
    data = data[paren_index:]
    left_count = data.count("(")
    right_count = data.count(")")
    if left_count != right_count:
        raise RecordError("Found %s left parens but %s right parens." %
                          (left_count, right_count))

    tokens = DndTokenizer(data)
    curr_node = None
    # state: "PreColon" normally; "PostColon" right after a ':' token, meaning
    # the next token is a branch length. Reset to "PreColon" after any
    # non-colon token.
    state = "PreColon"
    # state1: "PostClosed" right after a ')' token, meaning the next name
    # token labels the just-closed internal node instead of creating a leaf.
    state1 = "PreClosed"
    # last_token lets ",)" / "(," / "((" patterns be recognized as unnamed
    # (empty) nodes in the newick string
    last_token = None
    for t in tokens:
        if t == ":":  # expecting branch length
            state = "PostColon"
            # prevent state reset
            last_token = t
            continue
        if t == ")" and (last_token == ","
                         or last_token == "("):  # node without name
            new_node = _new_child(curr_node, constructor)
            new_node.name = None
            curr_node = new_node.parent
            state1 = "PostClosed"
            last_token = t
            continue
        if t == ")":  # closing the current node
            curr_node = curr_node.parent
            state1 = "PostClosed"
            last_token = t
            continue
        if t == "(":  # opening a new node
            curr_node = _new_child(curr_node, constructor)
        elif t == ";":  # end of data
            last_token = t
            break
        # node without name
        elif t == "," and (last_token == "," or last_token == "("):
            new_node = _new_child(curr_node, constructor)
            new_node.name = None
            curr_node = new_node.parent
        elif t == ",":  # separator: next node adds to this node's parent
            curr_node = curr_node.parent
        elif state == "PreColon" and state1 == "PreClosed":  # data for the current node
            # a fresh name token: create a new leaf child and descend into it
            new_node = _new_child(curr_node, constructor)
            if unescape_name:
                if t.startswith("'") and t.endswith("'"):
                    # strip all nested layers of enclosing quotes
                    while t.startswith("'") and t.endswith("'"):
                        t = t[1:-1]
                else:
                    if "_" in t:
                        t = t.replace("_", " ")
            new_node.name = t
            curr_node = new_node
        elif state == "PreColon" and state1 == "PostClosed":
            # name token directly after ')': labels the closed internal node
            if unescape_name:
                while t.startswith("'") and t.endswith("'"):
                    t = t[1:-1]
            curr_node.name = t
        elif state == "PostColon":  # length data for the current node
            curr_node.length = float(t)
        else:  # can't think of a reason to get here
            raise RecordError("Incorrect PhyloNode state? %s" % t)
        state = "PreColon"  # get here for any non-colon token
        state1 = "PreClosed"
        last_token = t

    if curr_node is not None and curr_node.parent is not None:
        raise RecordError("Didn't get back to root of tree.")

    if curr_node is None:  # no data -- return empty node
        return constructor()
    return curr_node  # this should be the root of the tree
Esempio n. 12
0
 def dnastrict(x, **kwargs):
     """Return a validated Dna sequence built from ``x``.

     Raises RecordError if Dna construction or validation fails.
     """
     try:
         return Dna(x, check=True, **kwargs)
     except Exception as e:
         # chain the original exception so the real cause is not lost
         raise RecordError("Could not convert sequence") from e
Esempio n. 13
0
 def dnastrict(x, **kwargs):
     """Return a DnaSequence built from ``x``.

     Raises RecordError if DnaSequence construction fails.
     """
     try:
         return DnaSequence(x, **kwargs)
     except Exception as e:
         # chain the original exception so the real cause is not lost
         raise RecordError("Could not convert sequence") from e
Esempio n. 14
0
def MinimalPhylipParser(data, id_map=None, interleaved=True):
    """Yields successive sequences from data as (label, seq) tuples.

    **Need to implement id map.

    **NOTE if using phylip interleaved format, will cache entire file in
        memory before returning sequences. If phylip file not interleaved
        then will yield each successive sequence.

    data: sequence of lines in phylip format (an open file, list, etc)
    id_map: optional id mapping from external ids to phylip labels - not sure
        if we're going to implement this
    interleaved: initial guess at the layout; overwritten by whatever
        _get_header_info reports from the header line.


    returns (id, sequence) tuples
    """

    # NOTE(review): seq_cache starts as a dict (interleaved layout keyed by
    # sequence index) but is rebound to a [id, chunk, ...] list in the
    # non-interleaved branch below — confusing but intentional here.
    seq_cache = {}
    interleaved_id_map = {}
    # assumes phylip labels occupy the first 10 columns — TODO confirm
    # against _split_line
    id_offset = 10
    curr_ct = -1  # -1 means "header line not read yet"

    for line in data:
        if curr_ct == -1:
            # get header info; note the header overrides the `interleaved`
            # argument
            num_seqs, seq_len, interleaved = _get_header_info(line)

            if not num_seqs or not seq_len:
                # unusable header: produce no sequences at all
                return
            curr_ct += 1
            continue

        curr_id, curr_seq = _split_line(line, id_offset)

        # skip blank lines
        if not curr_id and not curr_seq:
            continue

        if not interleaved:
            if curr_id:
                # a new labelled sequence starts: flush the previous one
                if seq_cache:
                    yield seq_cache[0], "".join(seq_cache[1:])
                seq_cache = [curr_id, curr_seq]
            else:
                # continuation line: append to the current sequence
                seq_cache.append(curr_seq)
        else:
            # interleaved: rows cycle through the sequences in order
            curr_id_ix = curr_ct % num_seqs

            # after the first full cycle the labels are no longer repeated,
            # so stop splitting off an id column
            if (curr_ct + 1) % num_seqs == 0:
                id_offset = 0

            if curr_id_ix not in interleaved_id_map:
                interleaved_id_map[curr_id_ix] = curr_id
                seq_cache[curr_id_ix] = []

            seq_cache[curr_id_ix].append(curr_seq)
        curr_ct += 1

    # return joined sequences if interleaved
    if interleaved:
        for curr_id_ix, seq_parts in list(seq_cache.items()):
            join_seq = "".join(seq_parts)

            # each sequence must match the length declared in the header
            if len(join_seq) != seq_len:
                raise RecordError(
                    "Length of sequence '%s' is not the same as in header "
                    "Found %d, Expected %d" %
                    (interleaved_id_map[curr_id_ix], len(join_seq), seq_len))

            yield interleaved_id_map[curr_id_ix], join_seq
    # return last seq if not interleaved
    else:
        if seq_cache:
            yield seq_cache[0], "".join(seq_cache[1:])
Esempio n. 15
0
def check_tree_info(tree_info):
    """Raise RecordError when the file has no tree section (empty/falsy input)."""
    if not tree_info:
        raise RecordError("not a valid Nexus Tree File")