예제 #1
0
파일: Record.py 프로젝트: kaspermunch/sap
    def to_generic(self, alphabet):
        """Retrieve generic alignment object for the given alignment.

        Instead of the tuples, this returns a MultipleSeqAlignment object
        from SAP.Bio.Align, through which you can manipulate and query
        the object.

        alphabet is the specified alphabet for the sequences in the code (for
        example IUPAC.IUPACProtein).

        Thanks to James Casbon for the code.
        """
        #TODO - Switch to new Bio.Align.MultipleSeqAlignment class?
        seq_parts = []
        seq_names = []
        parse_number = 0
        n = 0
        for name, start, seq, end in self.alignment:
            if name == 'QUERY':  # QUERY is the first in each alignment block
                parse_number += 1
                n = 0

            if parse_number == 1:  # create on first_parse, append on all others
                seq_parts.append(seq)
                seq_names.append(name)
            else:
                seq_parts[n] += seq
                n += 1

        generic = MultipleSeqAlignment([], alphabet)
        for (name, seq) in zip(seq_names, seq_parts):
            generic.append(SeqRecord(Seq(seq, alphabet), name))

        return generic
예제 #2
0
파일: Record.py 프로젝트: cbirdlab/sap
    def to_generic(self, alphabet):
        """Retrieve generic alignment object for the given alignment.

        Instead of the tuples, this returns a MultipleSeqAlignment object
        from SAP.Bio.Align, through which you can manipulate and query
        the object.

        alphabet is the specified alphabet for the sequences in the code (for
        example IUPAC.IUPACProtein).

        Thanks to James Casbon for the code.
        """
        #TODO - Switch to new Bio.Align.MultipleSeqAlignment class?
        seq_parts = []
        seq_names = []
        parse_number = 0
        n = 0
        for name, start, seq, end in self.alignment:
            if name == 'QUERY':  # QUERY is the first in each alignment block
                parse_number += 1
                n = 0

            if parse_number == 1:  # create on first_parse, append on all others
                seq_parts.append(seq)
                seq_names.append(name)
            else:
                seq_parts[n] += seq
                n += 1

        generic = MultipleSeqAlignment([], alphabet)
        for (name, seq) in zip(seq_names, seq_parts):
            generic.append(SeqRecord(Seq(seq, alphabet), name))

        return generic
예제 #3
0
def mult_align(sum_dict, align_dict):
    """Returns a biopython multiple alignment instance (MultipleSeqAlignment)"""
    mult_align_dict = {}
    for j in align_dict.abs(1).pos_align_dict:
        mult_align_dict[j] = ''

    for i in range(1, len(align_dict)+1):
        # loop on positions
        for j in align_dict.abs(i).pos_align_dict:
            # loop within a position
            mult_align_dict[j] += align_dict.abs(i).pos_align_dict[j].aa
    alpha = Alphabet.Gapped(Alphabet.IUPAC.extended_protein)
    fssp_align = MultipleSeqAlignment([], alphabet=alpha)
    for i in sorted(mult_align_dict):
        fssp_align.append(SeqRecord(Seq(mult_align_dict[i], alpha),
                                    sum_dict[i].pdb2+sum_dict[i].chain2))
    return fssp_align
예제 #4
0
def mult_align(sum_dict, align_dict):
    """Returns a biopython multiple alignment instance (MultipleSeqAlignment)"""
    mult_align_dict = {}
    for j in align_dict.abs(1).pos_align_dict:
        mult_align_dict[j] = ''

    for i in range(1, len(align_dict) + 1):
        # loop on positions
        for j in align_dict.abs(i).pos_align_dict:
            # loop within a position
            mult_align_dict[j] += align_dict.abs(i).pos_align_dict[j].aa
    alpha = Alphabet.Gapped(Alphabet.IUPAC.extended_protein)
    fssp_align = MultipleSeqAlignment([], alphabet=alpha)
    for i in sorted(mult_align_dict):
        fssp_align.append(
            SeqRecord(Seq(mult_align_dict[i], alpha),
                      sum_dict[i].pdb2 + sum_dict[i].chain2))
    return fssp_align
예제 #5
0
파일: FastaIO.py 프로젝트: kaspermunch/sap
    def build_hsp():
        if not query_tags and not match_tags:
            raise ValueError("No data for query %r, match %r"
                             % (query_id, match_id))
        assert query_tags, query_tags
        assert match_tags, match_tags
        evalue = align_tags.get("fa_expect", None)
        q = "?"  # Just for printing len(q) in debug below
        m = "?"  # Just for printing len(m) in debug below
        tool = global_tags.get("tool", "").upper()
        try:
            q = _extract_alignment_region(query_seq, query_tags)
            if tool in ["TFASTX"] and len(match_seq) == len(q):
                m = match_seq
                #Quick hack until I can work out how -, * and / characters
                #and the apparent mix of aa and bp coordinates works.
            else:
                m = _extract_alignment_region(match_seq, match_tags)
            assert len(q) == len(m)
        except AssertionError as err:
            print("Darn... amino acids vs nucleotide coordinates?")
            print(tool)
            print(query_seq)
            print(query_tags)
            print("%s %i" % (q, len(q)))
            print(match_seq)
            print(match_tags)
            print("%s %i" % (m, len(m)))
            print(handle.name)
            raise err

        assert alphabet is not None
        alignment = MultipleSeqAlignment([], alphabet)

        #TODO - Introduce an annotated alignment class?
        #For now, store the annotation a new private property:
        alignment._annotations = {}

        #Want to record both the query header tags, and the alignment tags.
        for key, value in header_tags.items():
            alignment._annotations[key] = value
        for key, value in align_tags.items():
            alignment._annotations[key] = value

        #Query
        #=====
        record = SeqRecord(Seq(q, alphabet),
                           id=query_id,
                           name="query",
                           description=query_descr,
                           annotations={"original_length": int(query_tags["sq_len"])})
        #TODO - handle start/end coordinates properly. Short term hack for now:
        record._al_start = int(query_tags["al_start"])
        record._al_stop = int(query_tags["al_stop"])
        alignment.append(record)

        #TODO - What if a specific alphabet has been requested?
        #TODO - Use an IUPAC alphabet?
        #TODO - Can FASTA output RNA?
        if alphabet == single_letter_alphabet and "sq_type" in query_tags:
            if query_tags["sq_type"] == "D":
                record.seq.alphabet = generic_dna
            elif query_tags["sq_type"] == "p":
                record.seq.alphabet = generic_protein
        if "-" in q:
            if not hasattr(record.seq.alphabet, "gap_char"):
                record.seq.alphabet = Gapped(record.seq.alphabet, "-")

        #Match
        #=====
        record = SeqRecord(Seq(m, alphabet),
                           id=match_id,
                           name="match",
                           description=match_descr,
                           annotations={"original_length": int(match_tags["sq_len"])})
        #TODO - handle start/end coordinates properly. Short term hack for now:
        record._al_start = int(match_tags["al_start"])
        record._al_stop = int(match_tags["al_stop"])
        alignment.append(record)

        #This is still a very crude way of dealing with the alphabet:
        if alphabet == single_letter_alphabet and "sq_type" in match_tags:
            if match_tags["sq_type"] == "D":
                record.seq.alphabet = generic_dna
            elif match_tags["sq_type"] == "p":
                record.seq.alphabet = generic_protein
        if "-" in m:
            if not hasattr(record.seq.alphabet, "gap_char"):
                record.seq.alphabet = Gapped(record.seq.alphabet, "-")

        return alignment
예제 #6
0
    def build_hsp():
        if not query_tags and not match_tags:
            raise ValueError("No data for query %r, match %r" %
                             (query_id, match_id))
        assert query_tags, query_tags
        assert match_tags, match_tags
        evalue = align_tags.get("fa_expect", None)
        q = "?"  # Just for printing len(q) in debug below
        m = "?"  # Just for printing len(m) in debug below
        tool = global_tags.get("tool", "").upper()
        try:
            q = _extract_alignment_region(query_seq, query_tags)
            if tool in ["TFASTX"] and len(match_seq) == len(q):
                m = match_seq
                #Quick hack until I can work out how -, * and / characters
                #and the apparent mix of aa and bp coordinates works.
            else:
                m = _extract_alignment_region(match_seq, match_tags)
            assert len(q) == len(m)
        except AssertionError as err:
            print("Darn... amino acids vs nucleotide coordinates?")
            print(tool)
            print(query_seq)
            print(query_tags)
            print("%s %i" % (q, len(q)))
            print(match_seq)
            print(match_tags)
            print("%s %i" % (m, len(m)))
            print(handle.name)
            raise err

        assert alphabet is not None
        alignment = MultipleSeqAlignment([], alphabet)

        #TODO - Introduce an annotated alignment class?
        #For now, store the annotation a new private property:
        alignment._annotations = {}

        #Want to record both the query header tags, and the alignment tags.
        for key, value in header_tags.items():
            alignment._annotations[key] = value
        for key, value in align_tags.items():
            alignment._annotations[key] = value

        #Query
        #=====
        record = SeqRecord(
            Seq(q, alphabet),
            id=query_id,
            name="query",
            description=query_descr,
            annotations={"original_length": int(query_tags["sq_len"])})
        #TODO - handle start/end coordinates properly. Short term hack for now:
        record._al_start = int(query_tags["al_start"])
        record._al_stop = int(query_tags["al_stop"])
        alignment.append(record)

        #TODO - What if a specific alphabet has been requested?
        #TODO - Use an IUPAC alphabet?
        #TODO - Can FASTA output RNA?
        if alphabet == single_letter_alphabet and "sq_type" in query_tags:
            if query_tags["sq_type"] == "D":
                record.seq.alphabet = generic_dna
            elif query_tags["sq_type"] == "p":
                record.seq.alphabet = generic_protein
        if "-" in q:
            if not hasattr(record.seq.alphabet, "gap_char"):
                record.seq.alphabet = Gapped(record.seq.alphabet, "-")

        #Match
        #=====
        record = SeqRecord(
            Seq(m, alphabet),
            id=match_id,
            name="match",
            description=match_descr,
            annotations={"original_length": int(match_tags["sq_len"])})
        #TODO - handle start/end coordinates properly. Short term hack for now:
        record._al_start = int(match_tags["al_start"])
        record._al_stop = int(match_tags["al_stop"])
        alignment.append(record)

        #This is still a very crude way of dealing with the alphabet:
        if alphabet == single_letter_alphabet and "sq_type" in match_tags:
            if match_tags["sq_type"] == "D":
                record.seq.alphabet = generic_dna
            elif match_tags["sq_type"] == "p":
                record.seq.alphabet = generic_protein
        if "-" in m:
            if not hasattr(record.seq.alphabet, "gap_char"):
                record.seq.alphabet = Gapped(record.seq.alphabet, "-")

        return alignment