예제 #1
0
파일: Generic.py 프로젝트: cbirdlab/sap
    def __format__(self, format_spec):
        """Render the alignment as a string in the requested file format.

        This is the hook used by the built-in format() function (added
        in Python 2.6/3.0).

        format_spec is handed unchanged to AlignIO.write() as the output
        format name, so it should be a lower case string naming a format
        that AlignIO can write.  An empty format_spec gives the same
        result as calling str() on the alignment.
        """
        if not format_spec:
            #Follow python convention and fall back on __str__
            return str(self)

        #Deferred imports, only needed once a real format was requested
        from SAP.Bio._py3k import StringIO
        from SAP.Bio import AlignIO

        #Write this single alignment into an in-memory handle and return
        #whatever text ended up in it.
        output = StringIO()
        AlignIO.write([self], output, format_spec)
        return output.getvalue()
예제 #2
0
파일: __init__.py 프로젝트: kaspermunch/sap
def write(sequences, handle, format):
    """Write a complete set of sequences to a file.

    Arguments:
     - sequences - A list (or iterator) of SeqRecord objects, or (if using
                   Biopython 1.54 or later) a single SeqRecord.
     - handle    - File handle object to write to, or filename as string
                   (note older versions of Biopython only took a handle).
     - format    - lower case string describing the file format to write.

    You should close the handle after calling this function.

    Returns the number of records written (as an integer).

    Raises TypeError if format is not a string, and ValueError if it is
    empty, not lower case, a read-only format, or unknown.
    """
    from SAP.Bio import AlignIO

    #Reject a bad format argument early, with a helpful message:
    if not isinstance(format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not format:
        raise ValueError("Format required (lower case string)")
    if format != format.lower():
        raise ValueError("Format string '%s' should be lower case" % format)

    #A lone SeqRecord is accepted and treated as a one element list
    #(older versions of Biopython raised an exception here).
    if isinstance(sequences, SeqRecord):
        sequences = [sequences]

    #Binary formats need the handle opened in binary mode
    mode = 'wb' if format in _BinaryFormats else 'w'

    with as_handle(handle, mode) as fp:
        if format in _FormatToWriter:
            #A dedicated SeqIO writer class exists for this format
            count = _FormatToWriter[format](fp).write_file(sequences)
        elif format in AlignIO._FormatToWriter:
            #Alignment-only format: bundle all the records into a single
            #alignment and let AlignIO write it.
            alignment = MultipleSeqAlignment(sequences)
            written = AlignIO.write([alignment], fp, format)
            assert written == 1, (
                "Internal error - the underlying writer "
                " should have returned 1, not %s" % repr(written))
            count = len(alignment)
        elif format in _FormatToIterator or format in AlignIO._FormatToIterator:
            #Known format, but only a parser is registered for it
            raise ValueError(
                "Reading format '%s' is supported, but not writing" % format)
        else:
            raise ValueError("Unknown format '%s'" % format)

        #Sanity check on whatever the writer reported back
        assert isinstance(count, int), (
            "Internal error - the underlying %s "
            "writer should have returned the record count, not %s"
            % (format, repr(count)))

    return count
예제 #3
0
파일: __init__.py 프로젝트: cbirdlab/sap
def write(sequences, handle, format):
    """Write a complete set of sequences to a file.

    Arguments:
     - sequences - A list (or iterator) of SeqRecord objects, or (if using
                   Biopython 1.54 or later) a single SeqRecord.
     - handle    - File handle object to write to, or filename as string
                   (note older versions of Biopython only took a handle).
     - format    - lower case string describing the file format to write.

    You should close the handle after calling this function.

    Returns the number of records written (as an integer).

    Raises TypeError if format is not a string, and ValueError if it is
    empty, not lower case, a read-only format, or unknown.
    """
    from SAP.Bio import AlignIO

    #Reject a bad format argument early, with a helpful message:
    if not isinstance(format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not format:
        raise ValueError("Format required (lower case string)")
    if format != format.lower():
        raise ValueError("Format string '%s' should be lower case" % format)

    #A lone SeqRecord is accepted and treated as a one element list
    #(older versions of Biopython raised an exception here).
    if isinstance(sequences, SeqRecord):
        sequences = [sequences]

    #Binary formats need the handle opened in binary mode
    mode = 'wb' if format in _BinaryFormats else 'w'

    with as_handle(handle, mode) as fp:
        if format in _FormatToWriter:
            #A dedicated SeqIO writer class exists for this format
            count = _FormatToWriter[format](fp).write_file(sequences)
        elif format in AlignIO._FormatToWriter:
            #Alignment-only format: bundle all the records into a single
            #alignment and let AlignIO write it.
            alignment = MultipleSeqAlignment(sequences)
            written = AlignIO.write([alignment], fp, format)
            assert written == 1, (
                "Internal error - the underlying writer "
                " should have returned 1, not %s" % repr(written))
            count = len(alignment)
        elif format in _FormatToIterator or format in AlignIO._FormatToIterator:
            #Known format, but only a parser is registered for it
            raise ValueError(
                "Reading format '%s' is supported, but not writing" % format)
        else:
            raise ValueError("Unknown format '%s'" % format)

        #Sanity check on whatever the writer reported back
        assert isinstance(count, int), (
            "Internal error - the underlying %s "
            "writer should have returned the record count, not %s"
            % (format, repr(count)))

    return count
예제 #4
0
if __name__ == "__main__":
    # Command line demo: build a StructureAlignment from a two-sequence
    # FASTA alignment and the first model of each of two PDB files.
    import sys
    from SAP.Bio.Alphabet import generic_protein
    from SAP.Bio import AlignIO
    from SAP.Bio.PDB import PDBParser

    if len(sys.argv) != 4:
        print("Expects three arguments,")
        print(" - FASTA alignment filename (expect two sequences)")
        print(" - PDB file one")
        print(" - PDB file two")
        sys.exit()

    # The alignment.  AlignIO.read() returns one fully parsed alignment,
    # so the file is closed as soon as the call returns instead of being
    # left open for the rest of the run.
    with open(sys.argv[1]) as fasta_handle:
        fa = AlignIO.read(fasta_handle, "fasta", generic_protein)

    pdb_file1 = sys.argv[2]
    pdb_file2 = sys.argv[3]

    # The structures (a fresh parser is used for each file)
    p = PDBParser()
    s1 = p.get_structure('1', pdb_file1)
    p = PDBParser()
    s2 = p.get_structure('2', pdb_file2)

    # Get the models: index 0 is the first model of each structure
    m1 = s1[0]
    m2 = s2[0]

    al = StructureAlignment(fa, m1, m2)
예제 #5
0
파일: __init__.py 프로젝트: kaspermunch/sap
def parse(handle, format, alphabet=None):
    r"""Iterate over the records in a sequence file as SeqRecord objects.

    Arguments:
     - handle   - handle to the file, or the filename as a string
                  (note older versions of Biopython only took a handle).
     - format   - lower case string describing the file format.
     - alphabet - optional Alphabet object, useful when the sequence type
                  cannot be automatically inferred from the file itself
                  (e.g. format="fasta" or "tab")

    This is a generator function, so nothing (including the argument
    checks) is executed until iteration starts.

    Typical usage, opening a file to read in, and looping over the record(s):

    >>> from SAP.Bio import SeqIO
    >>> filename = "Fasta/sweetpea.nu"
    >>> for record in SeqIO.parse(filename, "fasta"):
    ...    print("ID %s" % record.id)
    ...    print("Sequence length %i" % len(record))
    ...    print("Sequence alphabet %s" % record.seq.alphabet)
    ID gi|3176602|gb|U78617.1|LOU78617
    Sequence length 309
    Sequence alphabet SingleLetterAlphabet()

    For file formats like FASTA where the alphabet cannot be determined, it
    may be useful to specify the alphabet explicitly:

    >>> from SAP.Bio import SeqIO
    >>> from SAP.Bio.Alphabet import generic_dna
    >>> filename = "Fasta/sweetpea.nu"
    >>> for record in SeqIO.parse(filename, "fasta", generic_dna):
    ...    print("ID %s" % record.id)
    ...    print("Sequence length %i" % len(record))
    ...    print("Sequence alphabet %s" % record.seq.alphabet)
    ID gi|3176602|gb|U78617.1|LOU78617
    Sequence length 309
    Sequence alphabet DNAAlphabet()

    If you have a string 'data' containing the file contents, you must
    first turn this into a handle in order to parse it:

    >>> data = ">Alpha\nACCGGATGTA\n>Beta\nAGGCTCGGTTA\n"
    >>> from SAP.Bio import SeqIO
    >>> try:
    ...     from StringIO import StringIO # Python 2
    ... except ImportError:
    ...     from io import StringIO # Python 3
    ...
    >>> for record in SeqIO.parse(StringIO(data), "fasta"):
    ...     print("%s %s" % (record.id, record.seq))
    Alpha ACCGGATGTA
    Beta AGGCTCGGTTA

    Use the Bio.SeqIO.read(...) function when you expect a single record
    only.
    """
    #NOTE - The docstring above is a raw string (see the leading r before
    #the opening quote) because the StringIO example needs literal \n
    #sequences.
    from SAP.Bio import AlignIO

    #Binary formats (a hack for SFF, to be made more general in future)
    #are opened in binary mode, everything else with universal newlines.
    mode = 'rb' if format in _BinaryFormats else 'rU'

    #Reject bad arguments with a helpful message:
    if not isinstance(format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not format:
        raise ValueError("Format required (lower case string)")
    if format != format.lower():
        raise ValueError("Format string '%s' should be lower case" % format)
    if alphabet is not None \
            and not isinstance(alphabet, (Alphabet, AlphabetEncoder)):
        raise ValueError("Invalid alphabet, %s" % repr(alphabet))

    with as_handle(handle, mode) as fp:
        if format in _FormatToIterator:
            #A dedicated SeqIO iterator exists for this format
            make_iterator = _FormatToIterator[format]
            if alphabet is None:
                records = make_iterator(fp)
            else:
                try:
                    records = make_iterator(fp, alphabet=alphabet)
                except TypeError:
                    #This iterator takes no alphabet argument, so apply
                    #the alphabet to its records via _force_alphabet.
                    records = _force_alphabet(make_iterator(fp), alphabet)
        elif format in AlignIO._FormatToIterator:
            #Alignment format: flatten every alignment AlignIO yields
            #into its individual records.
            records = (record
                       for alignment in AlignIO.parse(fp, format,
                                                      alphabet=alphabet)
                       for record in alignment)
        else:
            raise ValueError("Unknown format '%s'" % format)

        #Re-yield from inside the with block so the handle stays open
        #while the caller is still iterating.
        for record in records:
            yield record
예제 #6
0
    for pos in sorted(summary_info.ic_vector):
        fout.write("%d %s %.3f\n" % (pos, rep_sequence[pos],
                   summary_info.ic_vector[pos]))

if __name__ == "__main__":
    # Quick manual test: read a FASTA alignment, print its records and
    # consensus sequences, then the position specific score matrix.
    print("Quick test")
    from SAP.Bio import AlignIO
    from SAP.Bio.Align.Generic import Alignment

    filename = "../../Tests/GFF/multi.fna"
    format = "fasta"
    # Uniform nucleotide frequencies (not referenced again in this excerpt)
    expected = FreqTable.FreqTable({"A": 0.25, "G": 0.25,
                                    "T": 0.25, "C": 0.25},
                                   FreqTable.FREQ,
                                   IUPAC.unambiguous_dna)

    # AlignIO.read() returns one fully parsed alignment, so the file is
    # closed as soon as the call returns instead of being left open.
    with open(filename) as handle:
        alignment = AlignIO.read(handle, format)
    for record in alignment:
        print(record.seq)
    print("=" * alignment.get_alignment_length())

    summary = SummaryInfo(alignment)
    consensus = summary.dumb_consensus(ambiguous="N")
    print(consensus)
    consensus = summary.gap_consensus(ambiguous="N")
    print(consensus)
    print("")
    # The PSSM is laid out along the gap consensus computed just above
    print(summary.pos_specific_score_matrix(chars_to_ignore=['-'],
                                            axis_seq=consensus))
    print("")
    #Have a generic alphabet, without a declared gap char, so must
    #provide the frequencies and chars to ignore explicitly.
예제 #7
0
if __name__ == "__main__":
    # Command line demo: build a StructureAlignment from a two-sequence
    # FASTA alignment and the first model of each of two PDB files.
    import sys
    from SAP.Bio.Alphabet import generic_protein
    from SAP.Bio import AlignIO
    from SAP.Bio.PDB import PDBParser

    if len(sys.argv) != 4:
        print("Expects three arguments,")
        print(" - FASTA alignment filename (expect two sequences)")
        print(" - PDB file one")
        print(" - PDB file two")
        sys.exit()

    # The alignment.  AlignIO.read() returns one fully parsed alignment,
    # so the file is closed as soon as the call returns instead of being
    # left open for the rest of the run.
    with open(sys.argv[1]) as fasta_handle:
        fa = AlignIO.read(fasta_handle, "fasta", generic_protein)

    pdb_file1 = sys.argv[2]
    pdb_file2 = sys.argv[3]

    # The structures (a fresh parser is used for each file)
    p = PDBParser()
    s1 = p.get_structure('1', pdb_file1)
    p = PDBParser()
    s2 = p.get_structure('2', pdb_file2)

    # Get the models: index 0 is the first model of each structure
    m1 = s1[0]
    m2 = s2[0]

    al = StructureAlignment(fa, m1, m2)
예제 #8
0
파일: AlignInfo.py 프로젝트: cbirdlab/sap
if __name__ == "__main__":
    # Quick manual test: read a FASTA alignment, print its records and
    # consensus sequences, then the position specific score matrix.
    print("Quick test")
    from SAP.Bio import AlignIO
    from SAP.Bio.Align.Generic import Alignment

    filename = "../../Tests/GFF/multi.fna"
    format = "fasta"
    # Uniform nucleotide frequencies (not referenced again in this excerpt)
    expected = FreqTable.FreqTable({
        "A": 0.25,
        "G": 0.25,
        "T": 0.25,
        "C": 0.25
    }, FreqTable.FREQ, IUPAC.unambiguous_dna)

    # AlignIO.read() returns one fully parsed alignment, so the file is
    # closed as soon as the call returns instead of being left open.
    with open(filename) as handle:
        alignment = AlignIO.read(handle, format)
    for record in alignment:
        print(record.seq)
    print("=" * alignment.get_alignment_length())

    summary = SummaryInfo(alignment)
    consensus = summary.dumb_consensus(ambiguous="N")
    print(consensus)
    consensus = summary.gap_consensus(ambiguous="N")
    print(consensus)
    print("")
    # The PSSM is laid out along the gap consensus computed just above
    print(
        summary.pos_specific_score_matrix(chars_to_ignore=['-'],
                                          axis_seq=consensus))
    print("")
    #Have a generic alphabet, without a declared gap char, so must tell
예제 #9
0
파일: __init__.py 프로젝트: cbirdlab/sap
def parse(handle, format, alphabet=None):
    r"""Iterate over the records in a sequence file as SeqRecord objects.

    Arguments:
     - handle   - handle to the file, or the filename as a string
                  (note older versions of Biopython only took a handle).
     - format   - lower case string describing the file format.
     - alphabet - optional Alphabet object, useful when the sequence type
                  cannot be automatically inferred from the file itself
                  (e.g. format="fasta" or "tab")

    This is a generator function, so nothing (including the argument
    checks) is executed until iteration starts.

    Typical usage, opening a file to read in, and looping over the record(s):

    >>> from SAP.Bio import SeqIO
    >>> filename = "Fasta/sweetpea.nu"
    >>> for record in SeqIO.parse(filename, "fasta"):
    ...    print("ID %s" % record.id)
    ...    print("Sequence length %i" % len(record))
    ...    print("Sequence alphabet %s" % record.seq.alphabet)
    ID gi|3176602|gb|U78617.1|LOU78617
    Sequence length 309
    Sequence alphabet SingleLetterAlphabet()

    For file formats like FASTA where the alphabet cannot be determined, it
    may be useful to specify the alphabet explicitly:

    >>> from SAP.Bio import SeqIO
    >>> from SAP.Bio.Alphabet import generic_dna
    >>> filename = "Fasta/sweetpea.nu"
    >>> for record in SeqIO.parse(filename, "fasta", generic_dna):
    ...    print("ID %s" % record.id)
    ...    print("Sequence length %i" % len(record))
    ...    print("Sequence alphabet %s" % record.seq.alphabet)
    ID gi|3176602|gb|U78617.1|LOU78617
    Sequence length 309
    Sequence alphabet DNAAlphabet()

    If you have a string 'data' containing the file contents, you must
    first turn this into a handle in order to parse it:

    >>> data = ">Alpha\nACCGGATGTA\n>Beta\nAGGCTCGGTTA\n"
    >>> from SAP.Bio import SeqIO
    >>> try:
    ...     from StringIO import StringIO # Python 2
    ... except ImportError:
    ...     from io import StringIO # Python 3
    ...
    >>> for record in SeqIO.parse(StringIO(data), "fasta"):
    ...     print("%s %s" % (record.id, record.seq))
    Alpha ACCGGATGTA
    Beta AGGCTCGGTTA

    Use the Bio.SeqIO.read(...) function when you expect a single record
    only.
    """
    #NOTE - The docstring above is a raw string (see the leading r before
    #the opening quote) because the StringIO example needs literal \n
    #sequences.
    from SAP.Bio import AlignIO

    #Binary formats (a hack for SFF, to be made more general in future)
    #are opened in binary mode, everything else with universal newlines.
    mode = 'rb' if format in _BinaryFormats else 'rU'

    #Reject bad arguments with a helpful message:
    if not isinstance(format, basestring):
        raise TypeError("Need a string for the file format (lower case)")
    if not format:
        raise ValueError("Format required (lower case string)")
    if format != format.lower():
        raise ValueError("Format string '%s' should be lower case" % format)
    if alphabet is not None \
            and not isinstance(alphabet, (Alphabet, AlphabetEncoder)):
        raise ValueError("Invalid alphabet, %s" % repr(alphabet))

    with as_handle(handle, mode) as fp:
        if format in _FormatToIterator:
            #A dedicated SeqIO iterator exists for this format
            make_iterator = _FormatToIterator[format]
            if alphabet is None:
                records = make_iterator(fp)
            else:
                try:
                    records = make_iterator(fp, alphabet=alphabet)
                except TypeError:
                    #This iterator takes no alphabet argument, so apply
                    #the alphabet to its records via _force_alphabet.
                    records = _force_alphabet(make_iterator(fp), alphabet)
        elif format in AlignIO._FormatToIterator:
            #Alignment format: flatten every alignment AlignIO yields
            #into its individual records.
            records = (record
                       for alignment in AlignIO.parse(fp, format,
                                                      alphabet=alphabet)
                       for record in alignment)
        else:
            raise ValueError("Unknown format '%s'" % format)

        #Re-yield from inside the with block so the handle stays open
        #while the caller is still iterating.
        for record in records:
            yield record