Example #1
0
    def common_ancestor(self, targets, *more_targets):
        """Most recent common ancestor (clade) of all the given targets.

        The targets are gathered with ``_combine_args(targets, *more_targets)``
        and a path is looked up for each one with ``self.get_path``.

        Edge cases:
        - If no target is given, returns self.root
        - If 1 target is given, returns the target
        - If any target is not found in this tree, raises a ValueError
        """
        # Materialise the combined targets once: validation must pair every
        # path with the target it was computed from, including the ones
        # passed through *more_targets.
        all_targets = list(_combine_args(targets, *more_targets))
        paths = [self.get_path(t) for t in all_targets]
        # Validation -- otherwise zip throws a spooky error below
        for p, t in zip(paths, all_targets):
            if p is None:
                raise ValueError("target %s is not in this tree" % repr(t))
        mrca = self.root
        # Walk all paths in lockstep, one depth level at a time
        for level in zip(*paths):
            ref = level[0]
            for other in level[1:]:
                if ref is not other:
                    break
            else:
                # every path holds the same clade at this depth
                mrca = ref
            if ref is not mrca:
                # the paths diverged, so the previous level was the answer
                break
        return mrca
Example #2
0
    def common_ancestor(self, targets, *more_targets):
        """Most recent common ancestor (clade) of all the given targets.

        Every target produced by ``_combine_args(targets, *more_targets)``
        is resolved to a path with ``self.get_path``; the result is the last
        clade shared by all of those paths.

        Edge cases:
        - If no target is given, returns self.root
        - If 1 target is given, returns the target
        - If any target is not found in this tree, raises a ValueError
        """
        # Keep the combined targets in a list so that the validation below
        # checks (and reports) exactly the targets the paths belong to.
        combined = list(_combine_args(targets, *more_targets))
        paths = [self.get_path(t) for t in combined]
        # Validation -- otherwise zip throws a spooky error below
        for p, t in zip(paths, combined):
            if p is None:
                raise ValueError("target %s is not in this tree" % repr(t))
        mrca = self.root
        # Compare the paths depth by depth
        for level in zip(*paths):
            ref = level[0]
            for other in level[1:]:
                if ref is not other:
                    break
            else:
                # all paths agree at this depth
                mrca = ref
            if ref is not mrca:
                # divergence found; mrca already holds the last shared clade
                break
        return mrca
Example #3
0
def _get_inter_coords(coords, strand=1):
    """From the given pairs of coordinates, returns a list of pairs
    covering the intervening ranges."""
    # adapted from Python's itertools guide
    # if strand is -1, adjust coords to the ends and starts are chained
    if strand == -1:
        sorted_coords = [(max(a, b), min(a, b)) for a, b in coords]
        inter_coords = list(chain(*sorted_coords))[1:-1]
        return list(zip(inter_coords[1::2], inter_coords[::2]))
    else:
        inter_coords = list(chain(*coords))[1:-1]
        return list(zip(inter_coords[::2], inter_coords[1::2]))
Example #4
0
def _get_inter_coords(coords, strand=1):
    """From the given pairs of coordinates, returns a list of pairs
    covering the intervening ranges."""
    # adapted from Python's itertools guide
    # if strand is -1, adjust coords to the ends and starts are chained
    if strand == -1:
        sorted_coords = [(max(a, b), min(a, b)) for a, b in coords]
        inter_coords = list(chain(*sorted_coords))[1:-1]
        return list(zip(inter_coords[1::2], inter_coords[::2]))
    else:
        inter_coords = list(chain(*coords))[1:-1]
        return list(zip(inter_coords[::2], inter_coords[1::2]))
Example #5
0
def _reorient_starts(starts, blksizes, seqlen, strand):
    """Reorients block starts into the opposite strand's coordinates.

    Arguments:
    starts -- List of integers, start coordinates.
    start -- Integer, 'Q start' or 'T start' column
    blksizes -- List of integers, block sizes.
    seqlen -- Integer of total sequence length.
    strand -- Integer denoting sequence strand.

    """
    assert len(starts) == len(blksizes), \
            "Unequal start coordinates and block sizes list (%r vs %r)" \
            % (len(starts), len(blksizes))
    # see: http://genome.ucsc.edu/goldenPath/help/blatSpec.html
    # no need to reorient if it's already the positive strand
    if strand >= 0:
        return starts
    else:
        # the plus-oriented coordinate is calculated by this:
        # plus_coord = length - minus_coord - block_size
        return [
            seqlen - start - blksize
            for start, blksize in zip(starts, blksizes)
        ]
Example #6
0
def _comp_intron_lens(seq_type, inter_blocks, raw_inter_lens):
    """Returns the length of introns between fragments."""
    # set opposite type, for setting introns
    opp_type = 'hit' if seq_type == 'query' else 'query'
    # list of flags to denote if an intron follows a block
    # it reads e.g. this line:
    # "ATGTT{TT}  >>>> Target Intron 1 >>>>  {G}TGTGTGTACATT"
    # and sets the opposing sequence type's intron (since this
    # line is present on the opposite sequence type line)
    has_intron_after = ['Intron' in x[seq_type] for x in inter_blocks]
    assert len(has_intron_after) == len(raw_inter_lens)
    # create list containing coord adjustments incorporating
    # intron lengths
    inter_lens = []
    for flag, parsed_len in zip(has_intron_after, raw_inter_lens):
        if flag:
            # joint introns
            if all(parsed_len[:2]):
                # intron len is [0] if opp_type is query, otherwise it's [1]
                intron_len = int(parsed_len[0]) if opp_type == 'query' \
                        else int(parsed_len[1])
            # single hit/query introns
            elif parsed_len[2]:
                intron_len = int(parsed_len[2])
            else:
                raise ValueError("Unexpected intron parsing "
                                 "result: %r" % parsed_len)
        else:
            intron_len = 0

        inter_lens.append(intron_len)

    return inter_lens
Example #7
0
    def randomized(cls, taxa, branch_length=1.0, branch_stdev=None):
        """Create a randomized bifurcating tree given a list of taxa.

        :param taxa: Either an integer specifying the number of taxa to create
            (automatically named taxon#), or an iterable of taxon names, as
            strings.
        :param branch_length: value handed to ``split`` for new branches and
            used as the mean of the gaussian noise.
        :param branch_stdev: when truthy, new branch lengths are redrawn
            with ``random.gauss`` using this deviation and clipped at 0.

        :returns: a tree of the same type as this class.
        """
        if isinstance(taxa, int):
            labels = ['taxon%s' % num for num in range(1, taxa + 1)]
        elif hasattr(taxa, '__iter__'):
            labels = list(taxa)
        else:
            raise TypeError("taxa argument must be integer (# taxa) or "
                            "iterable of taxon names.")
        tree = cls()
        leaves = [tree.root]
        wanted = len(labels)
        # keep splitting a randomly picked leaf until there are enough
        while len(leaves) < wanted:
            parent = random.choice(leaves)
            parent.split(branch_length=branch_length)
            children = parent.clades
            if branch_stdev:
                # Add some noise to the branch lengths
                for child in children:
                    noisy = random.gauss(branch_length, branch_stdev)
                    child.branch_length = max(0, noisy)
            leaves.remove(parent)
            leaves.extend(children)
        # Distribute taxon labels randomly
        random.shuffle(labels)
        for leaf, label in zip(leaves, labels):
            leaf.name = label
        return tree
Example #8
0
    def randomized(cls, taxa, branch_length=1.0, branch_stdev=None):
        """Build a random bifurcating tree labelled with the given taxa.

        :param taxa: Either an integer specifying the number of taxa to create
            (automatically named taxon#), or an iterable of taxon names, as
            strings.
        :param branch_length: length passed to ``split`` for each new pair
            of branches; also the mean used when noise is requested.
        :param branch_stdev: standard deviation of the gaussian noise applied
            to new branch lengths (negative draws become 0); no noise is
            added when this is falsy.

        :returns: a tree of the same type as this class.
        """
        if isinstance(taxa, int):
            names = []
            for index in range(taxa):
                names.append('taxon%s' % (index + 1))
        elif not hasattr(taxa, '__iter__'):
            raise TypeError("taxa argument must be integer (# taxa) or "
                            "iterable of taxon names.")
        else:
            names = list(taxa)
        result = cls()
        tips = [result.root]
        while len(tips) < len(names):
            # turn one random tip into an internal node
            chosen = random.choice(tips)
            chosen.split(branch_length=branch_length)
            fresh_tips = chosen.clades
            if branch_stdev:
                # Add some noise to the branch lengths
                for tip in fresh_tips:
                    tip.branch_length = max(
                        0, random.gauss(branch_length, branch_stdev))
            tips.remove(chosen)
            tips.extend(fresh_tips)
        # Distribute taxon labels randomly
        random.shuffle(names)
        for tip, tip_name in zip(tips, names):
            tip.name = tip_name
        return result
Example #9
0
def _comp_intron_lens(seq_type, inter_blocks, raw_inter_lens):
    """Returns the length of introns between fragments."""
    # set opposite type, for setting introns
    opp_type = 'hit' if seq_type == 'query' else 'query'
    # list of flags to denote if an intron follows a block
    # it reads e.g. this line:
    # "ATGTT{TT}  >>>> Target Intron 1 >>>>  {G}TGTGTGTACATT"
    # and sets the opposing sequence type's intron (since this
    # line is present on the opposite sequence type line)
    has_intron_after = ['Intron' in x[seq_type] for x in
            inter_blocks]
    assert len(has_intron_after) == len(raw_inter_lens)
    # create list containing coord adjustments incorporating
    # intron lengths
    inter_lens = []
    for flag, parsed_len in zip(has_intron_after, raw_inter_lens):
        if flag:
            # joint introns
            if all(parsed_len[:2]):
                # intron len is [0] if opp_type is query, otherwise it's [1]
                intron_len = int(parsed_len[0]) if opp_type == 'query' \
                        else int(parsed_len[1])
            # single hit/query introns
            elif parsed_len[2]:
                intron_len = int(parsed_len[2])
            else:
                raise ValueError("Unexpected intron parsing "
                        "result: %r" % parsed_len)
        else:
            intron_len = 0

        inter_lens.append(intron_len)

    return inter_lens
Example #10
0
File: Ace.py Project: cbirdlab/sap
 def __init__(self, line=None):
     """Set all fields to '' and, if given, fill them from a tagged line.

     Each known tag found in ``line`` contributes the text between the
     character following the tag (plus one separator character) and the
     start of the next tag found, stripped of surrounding whitespace. The
     value is stored on the attribute named after the lower-cased tag.
     """
     tags = ['CHROMAT_FILE', 'PHD_FILE', 'TIME', 'CHEM', 'DYE', 'TEMPLATE', 'DIRECTION']
     for tag in tags:
         setattr(self, tag.lower(), '')
     if not line:
         return
     # map the position of each tag to the tag; absent tags share key -1
     found = {line.find(tag): tag for tag in tags}
     found.pop(-1, None)
     starts = sorted(found)
     # every value ends where the next tag begins (or past the line end)
     stops = starts[1:] + [len(line) + 1]
     for begin, stop in zip(starts, stops):
         tag = found[begin]
         value = line[begin + len(tag) + 1:stop]
         setattr(self, tag.lower(), value.strip())
Example #11
0
def _flip_codons(codon_seq, target_seq):
    """Flips the codon characters from one seq to another."""
    a, b = '', ''
    for char1, char2 in zip(codon_seq, target_seq):
        # no need to do anything if the codon seq line has nothing
        if char1 == ' ':
            a += char1
            b += char2
        else:
            a += char2
            b += char1

    return a, b
Example #12
0
def _gaf10iterator(handle):
    for inline in handle:
        if inline[0] == '!':
            continue
        inrec = inline.rstrip('\n').split('\t')
        if len(inrec) == 1:
            continue
        inrec[3] = inrec[3].split('|') #Qualifier
        inrec[5] = inrec[5].split('|') # DB:reference(s)
        inrec[7] = inrec[7].split('|') # With || From
        inrec[10] = inrec[10].split('|') # Synonym
        inrec[12] = inrec[12].split('|') # Taxon
        yield dict(zip(GAF10FIELDS, inrec))
Example #13
0
def _flip_codons(codon_seq, target_seq):
    """Flips the codon characters from one seq to another."""
    a, b = '', ''
    for char1, char2 in zip(codon_seq, target_seq):
        # no need to do anything if the codon seq line has nothing
        if char1 == ' ':
            a += char1
            b += char2
        else:
            a += char2
            b += char1

    return a, b
Example #14
0
def _gaf10iterator(handle):
    for inline in handle:
        if inline[0] == '!':
            continue
        inrec = inline.rstrip('\n').split('\t')
        if len(inrec) == 1:
            continue
        inrec[3] = inrec[3].split('|')  #Qualifier
        inrec[5] = inrec[5].split('|')  # DB:reference(s)
        inrec[7] = inrec[7].split('|')  # With || From
        inrec[10] = inrec[10].split('|')  # Synonym
        inrec[12] = inrec[12].split('|')  # Taxon
        yield dict(zip(GAF10FIELDS, inrec))
Example #15
0
def _gpi10iterator(handle):
    """Read GPI 1.0 format files (PRIVATE).

    This iterator is used to read a gp_information.goa_uniprot
    file which is in the GPI 1.0 format.
    """
    for inline in handle:
        if inline[0] == '!':
            continue
        inrec = inline.rstrip('\n').split('\t')
        if len(inrec) == 1:
            continue
        inrec[5] = inrec[5].split('|')  # DB_Object_Synonym(s)
        inrec[8] = inrec[8].split('|')  # Annotation_Target_Set
        yield dict(zip(GPI10FIELDS, inrec))
Example #16
0
def _gpi10iterator(handle):
    """Read GPI 1.0 format files (PRIVATE).

    This iterator is used to read a gp_information.goa_uniprot
    file which is in the GPI 1.0 format.
    """
    for inline in handle:
        if inline[0] == '!':
            continue
        inrec = inline.rstrip('\n').split('\t')
        if len(inrec) == 1:
            continue
        inrec[5] = inrec[5].split('|') # DB_Object_Synonym(s)
        inrec[8] = inrec[8].split('|') # Annotation_Target_Set
        yield dict(zip(GPI10FIELDS, inrec))
Example #17
0
def _get_coords(filename):
    alb = file(filename)

    start_line = None
    end_line = None

    for line in alb:
        if line.startswith("["):
            if not start_line:
                start_line = line # rstrip not needed
            else:
                end_line = line

    if end_line is None: # sequence is too short
        return [(0, 0), (0, 0)]

    return list(zip(*map(_alb_line2coords, [start_line, end_line]))) # returns [(start0, end0), (start1, end1)]
Example #18
0
def _gpi11iterator(handle):
    """Read GPI 1.0 format files (PRIVATE).

    This iterator is used to read a gp_information.goa_uniprot
    file which is in the GPI 1.0 format.
    """
    for inline in handle:
        if inline[0] == '!':
            continue
        inrec = inline.rstrip('\n').split('\t')
        if len(inrec) == 1:
            continue
        inrec[2] = inrec[2].split('|')  # DB_Object_Name
        inrec[3] = inrec[3].split('|')  # DB_Object_Synonym(s)
        inrec[7] = inrec[7].split('|')  # DB_Xref(s)
        inrec[8] = inrec[8].split('|')  # Properties
        yield dict(zip(GPI11FIELDS, inrec))
Example #19
0
def _gpi11iterator(handle):
    """Read GPI 1.0 format files (PRIVATE).

    This iterator is used to read a gp_information.goa_uniprot
    file which is in the GPI 1.0 format.
    """
    for inline in handle:
        if inline[0] == '!':
            continue
        inrec = inline.rstrip('\n').split('\t')
        if len(inrec) == 1:
            continue
        inrec[2] = inrec[2].split('|') # DB_Object_Name
        inrec[3] = inrec[3].split('|') # DB_Object_Synonym(s)
        inrec[7] = inrec[7].split('|') # DB_Xref(s)
        inrec[8] = inrec[8].split('|') # Properties
        yield dict(zip(GPI11FIELDS, inrec))
Example #20
0
def _get_coords(filename):
    alb = file(filename)

    start_line = None
    end_line = None

    for line in alb:
        if line.startswith("["):
            if not start_line:
                start_line = line  # rstrip not needed
            else:
                end_line = line

    if end_line is None:  # sequence is too short
        return [(0, 0), (0, 0)]

    return list(zip(*map(_alb_line2coords, [start_line, end_line]))
                )  # returns [(start0, end0), (start1, end1)]
Example #21
0
def _gpa11iterator(handle):
    """Read GPA 1.1 format files (PRIVATE).

    This iterator is used to read a gp_association.goa_uniprot
    file which is in the GPA 1.1 format. Do not call directly. Rather
    use the gpa_iterator function
    """
    for inline in handle:
        if inline[0] == '!':
            continue
        inrec = inline.rstrip('\n').split('\t')
        if len(inrec) == 1:
            continue
        inrec[2] = inrec[2].split('|') # Qualifier
        inrec[4] = inrec[4].split('|') # DB:Reference(s)
        inrec[6] = inrec[6].split('|') # With
        inrec[10] = inrec[10].split('|') # Annotation extension
        yield dict(zip(GPA11FIELDS, inrec))
Example #22
0
def _gpa11iterator(handle):
    """Read GPA 1.1 format files (PRIVATE).

    This iterator is used to read a gp_association.goa_uniprot
    file which is in the GPA 1.1 format. Do not call directly. Rather
    use the gpa_iterator function
    """
    for inline in handle:
        if inline[0] == '!':
            continue
        inrec = inline.rstrip('\n').split('\t')
        if len(inrec) == 1:
            continue
        inrec[2] = inrec[2].split('|')  # Qualifier
        inrec[4] = inrec[4].split('|')  # DB:Reference(s)
        inrec[6] = inrec[6].split('|')  # With
        inrec[10] = inrec[10].split('|')  # Annotation extension
        yield dict(zip(GPA11FIELDS, inrec))
Example #23
0
def _gaf20byproteiniterator(handle):
    cur_id = None
    id_rec_list = []
    for inline in handle:
        if inline[0] == '!':
            continue
        inrec = inline.rstrip('\n').split('\t')
        if len(inrec) == 1:
            continue
        inrec[3] = inrec[3].split('|') #Qualifier
        inrec[5] = inrec[5].split('|') # DB:reference(s)
        inrec[7] = inrec[7].split('|') # With || From
        inrec[10] = inrec[10].split('|') # Synonym
        inrec[12] = inrec[12].split('|') # Taxon
        cur_rec = dict(zip(GAF20FIELDS, inrec))
        if cur_rec['DB_Object_ID'] != cur_id and cur_id:
            ret_list = copy.copy(id_rec_list)
            id_rec_list = [cur_rec]
            cur_id = cur_rec['DB_Object_ID']
            yield ret_list
        else:
            cur_id = cur_rec['DB_Object_ID']
            id_rec_list.append(cur_rec)
Example #24
0
def _reorient_starts(starts, blksizes, seqlen, strand):
    """Reorients block starts into the opposite strand's coordinates.

    Arguments:
    starts -- List of integers, start coordinates.
    start -- Integer, 'Q start' or 'T start' column
    blksizes -- List of integers, block sizes.
    seqlen -- Integer of total sequence length.
    strand -- Integer denoting sequence strand.

    """
    assert len(starts) == len(blksizes), \
            "Unequal start coordinates and block sizes list (%r vs %r)" \
            % (len(starts), len(blksizes))
    # see: http://genome.ucsc.edu/goldenPath/help/blatSpec.html
    # no need to reorient if it's already the positive strand
    if strand >= 0:
        return starts
    else:
        # the plus-oriented coordinate is calculated by this:
        # plus_coord = length - minus_coord - block_size
        return [seqlen - start - blksize for
                start, blksize in zip(starts, blksizes)]
Example #25
0
def _gaf20byproteiniterator(handle):
    cur_id = None
    id_rec_list = []
    for inline in handle:
        if inline[0] == '!':
            continue
        inrec = inline.rstrip('\n').split('\t')
        if len(inrec) == 1:
            continue
        inrec[3] = inrec[3].split('|')  #Qualifier
        inrec[5] = inrec[5].split('|')  # DB:reference(s)
        inrec[7] = inrec[7].split('|')  # With || From
        inrec[10] = inrec[10].split('|')  # Synonym
        inrec[12] = inrec[12].split('|')  # Taxon
        cur_rec = dict(zip(GAF20FIELDS, inrec))
        if cur_rec['DB_Object_ID'] != cur_id and cur_id:
            ret_list = copy.copy(id_rec_list)
            id_rec_list = [cur_rec]
            cur_id = cur_rec['DB_Object_ID']
            yield ret_list
        else:
            cur_id = cur_rec['DB_Object_ID']
            id_rec_list.append(cur_rec)
Example #26
0
def AbiIterator(handle, alphabet=None, trim=False):
    """Iterator for the Abi file format.

    Arguments:
     - handle   - seekable handle; if it has a ``mode`` attribute, that
       mode must consist of the letters 'r' and 'b'.
     - alphabet - optional alphabet for the sequence; protein and RNA
       alphabets are rejected. When None, ambiguous_dna is used if the
       base-called sequence contains any of 'KYWMRS', otherwise
       unambiguous_dna.
     - trim     - when true, the record is passed through _abi_trim
       before being yielded.

    Yields a single SeqRecord, or nothing at all for an empty file.

    Raises ValueError for a protein/RNA alphabet or a wrong handle mode,
    and IOError when the file does not start with 'ABIF'. As this is a
    generator, these are raised when iteration starts.
    """
    # raise exception if alphabet is not dna
    if alphabet is not None:
        base_alphabet = Alphabet._get_base_alphabet(alphabet)
        if isinstance(base_alphabet, Alphabet.ProteinAlphabet):
            raise ValueError(
                "Invalid alphabet, ABI files do not hold proteins.")
        if isinstance(base_alphabet, Alphabet.RNAAlphabet):
            raise ValueError("Invalid alphabet, ABI files do not hold RNA.")

    # raise exception if handle mode is not 'rb'
    if hasattr(handle, 'mode'):
        if set('rb') != set(handle.mode.lower()):
            raise ValueError("ABI files has to be opened in 'rb' mode.")

    # check if input file is a valid Abi file
    handle.seek(0)
    marker = handle.read(4)
    if not marker:
        # handle empty file gracefully; a plain return ends the generator
        # (raising StopIteration here turns into RuntimeError, PEP 479)
        return
    if marker != _as_bytes('ABIF'):
        raise IOError('File should start ABIF, not %r' % marker)

    # dirty hack for handling time information
    times = {'RUND1': '', 'RUND2': '', 'RUNT1': '', 'RUNT2': '', }

    # initialize annotations: every extracted name starts out as None
    annot = dict.fromkeys(_EXTRACT.values())

    # parse header and extract data from directories
    header = struct.unpack(_HEADFMT,
                           handle.read(struct.calcsize(_HEADFMT)))

    for tag_name, tag_number, tag_data in _abi_parse_header(header, handle):
        # stop iteration if all desired tags have been extracted
        # 4 tags from _EXTRACT + 2 time tags from _SPCTAGS - 3,
        # and seq, qual, id
        # todo

        key = tag_name + str(tag_number)

        # PBAS2 is base-called sequence
        if key == 'PBAS2':
            seq = tag_data
            ambigs = 'KYWMRS'
            if alphabet is None:
                if set(seq).intersection(ambigs):
                    alphabet = ambiguous_dna
                else:
                    alphabet = unambiguous_dna
        # PCON2 is quality values of base-called sequence
        elif key == 'PCON2':
            qual = [ord(val) for val in tag_data]
        # SMPL1 is sample id entered before sequencing run
        elif key == 'SMPL1':
            sample_id = tag_data
        elif key in times:
            times[key] = tag_data
        else:
            # extract sequence annotation as defined in _EXTRACT
            if key in _EXTRACT:
                annot[_EXTRACT[key]] = tag_data

    # set time annotations
    annot['run_start'] = '%s %s' % (times['RUND1'], times['RUNT1'])
    annot['run_finish'] = '%s %s' % (times['RUND2'], times['RUNT2'])

    # use the file name as SeqRecord.name if available; this is best
    # effort, but must not swallow KeyboardInterrupt/SystemExit
    try:
        file_name = basename(handle.name).replace('.ab1', '')
    except Exception:
        file_name = ""

    record = SeqRecord(Seq(seq, alphabet),
                       id=sample_id, name=file_name,
                       description='',
                       annotations=annot,
                       letter_annotations={'phred_quality': qual})

    if not trim:
        yield record
    else:
        yield _abi_trim(record)
Example #27
0
def AbiIterator(handle, alphabet=None, trim=False):
    """Iterator for the Abi file format.

    Arguments:
     - handle   - seekable handle; if it has a ``mode`` attribute, that
       mode must consist of the letters 'r' and 'b'.
     - alphabet - optional alphabet for the sequence; protein and RNA
       alphabets are rejected. When None, ambiguous_dna is used if the
       base-called sequence contains any of "KYWMRS", otherwise
       unambiguous_dna.
     - trim     - when true, the record is passed through _abi_trim
       before being yielded.

    Yields a single SeqRecord, or nothing at all for an empty file.

    Raises ValueError for a protein/RNA alphabet or a wrong handle mode,
    and IOError when the file does not start with "ABIF". As this is a
    generator, these are raised when iteration starts.
    """
    # raise exception if alphabet is not dna
    if alphabet is not None:
        base_alphabet = Alphabet._get_base_alphabet(alphabet)
        if isinstance(base_alphabet, Alphabet.ProteinAlphabet):
            raise ValueError("Invalid alphabet, ABI files do not hold proteins.")
        if isinstance(base_alphabet, Alphabet.RNAAlphabet):
            raise ValueError("Invalid alphabet, ABI files do not hold RNA.")

    # raise exception if handle mode is not 'rb'
    if hasattr(handle, "mode"):
        if set("rb") != set(handle.mode.lower()):
            raise ValueError("ABI files has to be opened in 'rb' mode.")

    # check if input file is a valid Abi file
    handle.seek(0)
    marker = handle.read(4)
    if not marker:
        # handle empty file gracefully; under PEP 479 a StopIteration
        # raised inside a generator becomes RuntimeError, so just return
        return
    if marker != _as_bytes("ABIF"):
        raise IOError("File should start ABIF, not %r" % marker)

    # dirty hack for handling time information
    times = {"RUND1": "", "RUND2": "", "RUNT1": "", "RUNT2": ""}

    # initialize annotations: every extracted name starts out as None
    annot = dict.fromkeys(_EXTRACT.values())

    # parse header and extract data from directories
    header = struct.unpack(_HEADFMT, handle.read(struct.calcsize(_HEADFMT)))

    for tag_name, tag_number, tag_data in _abi_parse_header(header, handle):
        # stop iteration if all desired tags have been extracted
        # 4 tags from _EXTRACT + 2 time tags from _SPCTAGS - 3,
        # and seq, qual, id
        # todo

        key = tag_name + str(tag_number)

        # PBAS2 is base-called sequence
        if key == "PBAS2":
            seq = tag_data
            ambigs = "KYWMRS"
            if alphabet is None:
                if set(seq).intersection(ambigs):
                    alphabet = ambiguous_dna
                else:
                    alphabet = unambiguous_dna
        # PCON2 is quality values of base-called sequence
        elif key == "PCON2":
            qual = [ord(val) for val in tag_data]
        # SMPL1 is sample id entered before sequencing run
        elif key == "SMPL1":
            sample_id = tag_data
        elif key in times:
            times[key] = tag_data
        else:
            # extract sequence annotation as defined in _EXTRACT
            if key in _EXTRACT:
                annot[_EXTRACT[key]] = tag_data

    # set time annotations
    annot["run_start"] = "%s %s" % (times["RUND1"], times["RUNT1"])
    annot["run_finish"] = "%s %s" % (times["RUND2"], times["RUNT2"])

    # use the file name as SeqRecord.name if available; deliberately
    # best effort, but without hiding KeyboardInterrupt/SystemExit
    try:
        file_name = basename(handle.name).replace(".ab1", "")
    except Exception:
        file_name = ""

    record = SeqRecord(
        Seq(seq, alphabet),
        id=sample_id,
        name=file_name,
        description="",
        annotations=annot,
        letter_annotations={"phred_quality": qual},
    )

    if not trim:
        yield record
    else:
        yield _abi_trim(record)
Example #28
0
def _create_hsp(hid, qid, psl):
    """Build an HSP object from one parsed PSL row.

    Arguments:
    hid -- hit identifier handed to every HSPFragment.
    qid -- query identifier handed to every HSPFragment.
    psl -- mapping with the parsed PSL values read below ('strand',
        'qstarts', 'tstarts', 'blocksizes', 'qsize', 'tsize', the match
        and gap counters, and optionally 'tseqs' / 'qseqs').

    Returns the HSP, made of one fragment per block. Several consistency
    checks are done with assert statements.
    """
    # protein flag
    is_protein = _is_protein(psl)
    # strand: 0 for a protein query, otherwise taken from the first char
    if is_protein:
        qstrand = 0
    elif psl['strand'][0] == '+':
        qstrand = 1
    else:
        qstrand = -1
    # try to get hit strand, if it exists
    try:
        hit_strand_char = psl['strand'][1]
    except IndexError:
        hstrand = 1  # hit strand defaults to plus
    else:
        hstrand = 1 if hit_strand_char == '+' else -1

    # query block starts
    qstarts = _reorient_starts(psl['qstarts'], psl['blocksizes'],
                               psl['qsize'], qstrand)
    # hit block starts, reoriented only when a hit strand is present
    has_hit_strand = len(psl['strand']) == 2
    if has_hit_strand:
        hstarts = _reorient_starts(psl['tstarts'], psl['blocksizes'],
                                   psl['tsize'], hstrand)
    else:
        hstarts = psl['tstarts']
    # set query and hit coords
    # this assumes each block has no gaps (which seems to be the case)
    assert len(qstarts) == len(hstarts) == len(psl['blocksizes'])
    query_range_all = [(begin, begin + size)
                       for begin, size in zip(qstarts, psl['blocksizes'])]
    hit_range_all = [(begin, begin + size)
                     for begin, size in zip(hstarts, psl['blocksizes'])]
    # check length of sequences and coordinates, all must match
    if 'tseqs' in psl and 'qseqs' in psl:
        assert len(psl['tseqs']) == len(psl['qseqs']) == \
                len(query_range_all) == len(hit_range_all)
    else:
        assert len(query_range_all) == len(hit_range_all)

    hseqlist = psl.get('tseqs')
    qseqlist = psl.get('qseqs')
    frags = []
    # iterating over query_range_all, but hit_range_all works just as well
    for idx, (query_start, query_end) in enumerate(query_range_all):
        hit_start, hit_end = hit_range_all[idx]
        hseq = hseqlist[idx] if hseqlist else ''
        qseq = qseqlist[idx] if qseqlist else ''
        frag = HSPFragment(hid, qid, hit=hseq, query=qseq)
        # set alphabet
        frag.alphabet = generic_dna
        # set coordinates
        frag.query_start = query_start
        frag.query_end = query_end
        frag.hit_start = hit_start
        frag.hit_end = hit_end
        # and strands
        frag.query_strand = qstrand
        frag.hit_strand = hstrand
        frags.append(frag)

    # create hsp object
    hsp = HSP(frags)
    # check if start and end are set correctly
    assert hsp.query_start == psl['qstart']
    assert hsp.query_end == psl['qend']
    assert hsp.hit_start == psl['tstart']
    assert hsp.hit_end == psl['tend']
    # and check block spans as well
    assert hsp.query_span_all == hsp.hit_span_all == psl['blocksizes']
    # set its attributes: values copied straight from the PSL row
    copied = (
        ('match_num', 'matches'),
        ('mismatch_num', 'mismatches'),
        ('match_rep_num', 'repmatches'),
        ('n_num', 'ncount'),
        ('query_gapopen_num', 'qnuminsert'),
        ('query_gap_num', 'qbaseinsert'),
        ('hit_gapopen_num', 'tnuminsert'),
        ('hit_gap_num', 'tbaseinsert'),
    )
    for attr_name, psl_key in copied:
        setattr(hsp, attr_name, psl[psl_key])

    # derived values
    hsp.ident_num = psl['matches'] + psl['repmatches']
    hsp.gapopen_num = psl['qnuminsert'] + psl['tnuminsert']
    hsp.gap_num = psl['qbaseinsert'] + psl['tbaseinsert']
    hsp.query_is_protein = is_protein
    hsp.ident_pct = 100.0 - _calc_millibad(psl, is_protein) * 0.1
    hsp.score = _calc_score(psl, is_protein)
    # helper flag, for writing
    hsp._has_hit_strand = has_hit_strand

    return hsp
Example #29
0
def _create_hsp(hid, qid, psl):
    """Create an HSP from the values of a single parsed PSL row.

    Arguments:
    hid -- hit identifier given to each HSPFragment.
    qid -- query identifier given to each HSPFragment.
    psl -- mapping of parsed PSL values; the keys read are 'strand',
        'qstarts', 'tstarts', 'blocksizes', 'qsize', 'tsize', the start /
        end coordinates, the match and gap counters, plus the optional
        'tseqs' and 'qseqs'.

    Returns the HSP holding one fragment per block. Internal consistency
    is verified with assert statements.
    """

    def _strand_of(symbol):
        # '+' means plus strand, anything else minus
        return 1 if symbol == '+' else -1

    # protein flag
    is_protein = _is_protein(psl)
    # strand
    # if query is protein, strand is 0
    qstrand = 0 if is_protein else _strand_of(psl['strand'][0])
    # try to get hit strand, if it exists
    try:
        hstrand = _strand_of(psl['strand'][1])
    except IndexError:
        hstrand = 1  # hit strand defaults to plus

    # query block starts
    qstarts = _reorient_starts(psl['qstarts'], psl['blocksizes'], psl['qsize'],
                               qstrand)
    # hit block starts
    if len(psl['strand']) != 2:
        hstarts = psl['tstarts']
    else:
        hstarts = _reorient_starts(psl['tstarts'], psl['blocksizes'],
                                   psl['tsize'], hstrand)
    # set query and hit coords
    # this assumes each block has no gaps (which seems to be the case)
    assert len(qstarts) == len(hstarts) == len(psl['blocksizes'])
    query_ends = [x + y for x, y in zip(qstarts, psl['blocksizes'])]
    query_range_all = list(zip(qstarts, query_ends))
    hit_ends = [x + y for x, y in zip(hstarts, psl['blocksizes'])]
    hit_range_all = list(zip(hstarts, hit_ends))
    # check length of sequences and coordinates, all must match
    if 'tseqs' in psl and 'qseqs' in psl:
        assert len(psl['tseqs']) == len(psl['qseqs']) == \
                len(query_range_all) == len(hit_range_all)
    else:
        assert len(query_range_all) == len(hit_range_all)

    frags = []
    # iterating over query_range_all, but hit_range_all works just as well
    for idx, qcoords in enumerate(query_range_all):
        hcoords = hit_range_all[idx]
        hseqlist = psl.get('tseqs')
        if hseqlist:
            hseq = hseqlist[idx]
        else:
            hseq = ''
        qseqlist = psl.get('qseqs')
        if qseqlist:
            qseq = qseqlist[idx]
        else:
            qseq = ''
        frag = HSPFragment(hid, qid, hit=hseq, query=qseq)
        # set alphabet
        frag.alphabet = generic_dna
        # set coordinates
        frag.query_start, frag.query_end = qcoords[0], qcoords[1]
        frag.hit_start, frag.hit_end = hcoords[0], hcoords[1]
        # and strands
        frag.query_strand = qstrand
        frag.hit_strand = hstrand
        frags.append(frag)

    # create hsp object
    hsp = HSP(frags)
    # check if start and end are set correctly
    assert hsp.query_start == psl['qstart']
    assert hsp.query_end == psl['qend']
    assert hsp.hit_start == psl['tstart']
    assert hsp.hit_end == psl['tend']
    # and check block spans as well
    assert hsp.query_span_all == hsp.hit_span_all == psl['blocksizes']
    # set its attributes
    hsp.match_num = psl['matches']
    hsp.mismatch_num = psl['mismatches']
    hsp.match_rep_num = psl['repmatches']
    hsp.n_num = psl['ncount']
    hsp.query_gapopen_num = psl['qnuminsert']
    hsp.query_gap_num = psl['qbaseinsert']
    hsp.hit_gapopen_num = psl['tnuminsert']
    hsp.hit_gap_num = psl['tbaseinsert']

    # totals combining the counters above
    hsp.ident_num = psl['matches'] + psl['repmatches']
    hsp.gapopen_num = psl['qnuminsert'] + psl['tnuminsert']
    hsp.gap_num = psl['qbaseinsert'] + psl['tbaseinsert']
    hsp.query_is_protein = is_protein
    hsp.ident_pct = 100.0 - _calc_millibad(psl, is_protein) * 0.1
    hsp.score = _calc_score(psl, is_protein)
    # helper flag, for writing
    hsp._has_hit_strand = len(psl['strand']) == 2

    return hsp
Example #30
0
def parse_vulgar_comp(hsp, vulgar_comp):
    """Parse the vulgar components and store block coordinates in the hsp dict.

    The vulgar string is split into (label, query step, hit step) components
    using ``_RE_VCOMP``. Consecutive components labelled M, C, G or S are
    merged into one block; every other label separates two blocks.

    Arguments:
     - hsp - dictionary that must already hold ``query_start``,
       ``hit_start``, ``query_strand`` and ``hit_strand`` (and
       ``query_end`` / ``hit_end`` when the respective strand is negative).
     - vulgar_comp - string with the vulgar components.

    The following keys are written to ``hsp``:
     - ``query_split_codons`` / ``hit_split_codons`` - list of
       (start, end) tuples, one per S component, with start <= end.
     - ``query_ner_ranges`` / ``hit_ner_ranges`` - initialised to empty
       lists; this function does not fill them.
     - ``query_ranges`` / ``hit_ranges`` - list of (start, end) tuples,
       one per block.

    For a negative strand, the ``start`` and ``end`` values of that sequence
    are swapped in ``hsp``, and so are the collected block starts and ends.

    Returns the same (mutated) ``hsp`` dictionary. An AssertionError is
    raised when a component carries an unrecognised label.
    """
    # labels of the components that make up a block
    block_labels = 'MCGS'
    # containers for block coordinates; the first block starts where the
    # HSP starts, so the start containers are seeded with those values
    qstarts, qends = [hsp['query_start']], []
    hstarts, hends = [hsp['hit_start']], []
    # containers for split codons
    hsp['query_split_codons'], hsp['hit_split_codons'] = [], []
    # containers for ner blocks
    hsp['query_ner_ranges'], hsp['hit_ner_ranges'] = [], []
    # sentinels for tracking query and hit positions
    qpos, hpos = hsp['query_start'], hsp['hit_start']
    # multiplier for determining sentinel movement
    qmove = 1 if hsp['query_strand'] >= 0 else -1
    hmove = 1 if hsp['hit_strand'] >= 0 else -1

    vcomps = re.findall(_RE_VCOMP, vulgar_comp)
    last_idx = len(vcomps) - 1
    for idx, match in enumerate(vcomps):
        label, qstep, hstep = match[0], int(match[1]), int(match[2])
        # check for label, must be recognized
        assert label in 'MCGF53INS', "Unexpected vulgar label: %r" % label
        in_block = label in block_labels

        # a block component preceded by a non-block component starts a
        # new block. The first component has no predecessor and its start
        # is already seeded above; without the ``idx > 0`` guard,
        # ``vcomps[idx-1]`` would wrap around to the *last* component and
        # could add a duplicate start.
        if in_block and idx > 0 and vcomps[idx-1][0] not in block_labels:
            qstarts.append(qpos)
            hstarts.append(hpos)

        # positions of the sentinels after consuming this component
        qnext = qpos + qstep * qmove
        hnext = hpos + hstep * hmove

        # we're not doing anything extra if the label is in '53IN', as
        # these basically tell us what the inter-block coordinates are and
        # inter-block coordinates are automatically calculated by an HSP
        # property
        if label == 'S':
            # split codons, stored as (start, end) with start <= end
            # XXX: is it possible to have a frameshift that introduces
            # a codon split? If so, this may need a different treatment..
            hsp['query_split_codons'].append(
                (min(qpos, qnext), max(qpos, qnext)))
            hsp['hit_split_codons'].append(
                (min(hpos, hnext), max(hpos, hnext)))

        # move sentinels accordingly
        qpos, hpos = qnext, hnext

        # append to ends if it's the last comp, or if this comp is part of
        # a block and the next comp is not
        if idx == last_idx or \
                (in_block and vcomps[idx+1][0] not in block_labels):
            qends.append(qpos)
            hends.append(hpos)

    # adjust coordinates
    for seq_type in ('query_', 'hit_'):
        strand = hsp[seq_type + 'strand']
        # switch coordinates if strand is < 0
        if strand < 0:
            # switch the starts and ends
            hsp[seq_type + 'start'], hsp[seq_type + 'end'] = \
                    hsp[seq_type + 'end'], hsp[seq_type + 'start']
            if seq_type == 'query_':
                qstarts, qends = qends, qstarts
            else:
                hstarts, hends = hends, hstarts

    # set start and end ranges; zip stops at the shorter container, so a
    # surplus trailing end (last comp not part of a block) is dropped
    hsp['query_ranges'] = list(zip(qstarts, qends))
    hsp['hit_ranges'] = list(zip(hstarts, hends))
    return hsp