예제 #1
0
파일: bam2msa.py 프로젝트: spond/BioExt
def test_align():
    ''' Run the alignment pipeline on the SHORT.FASTA fixture (which is
    meant to contain a sequence ending in '-') and check that writing and
    sorting the resulting BAM file does not raise. '''

    here = os.path.dirname(os.path.realpath(__file__))

    # Fixture input and the BAM file produced next to it
    fasta_path = os.path.join(here, "./rsrc/SHORT.FASTA")
    bam_path = os.path.join(here, "./rsrc/SHORT.FASTA.test.bam")

    records = SeqIO.parse(fasta_path, 'fasta')

    # The first record of the fixture serves as the reference
    reference = gapless(next(records))

    def with_reference(records):
        # Emit the reference aligned against itself first, then echo and
        # pass through every record handed over by the aligner.
        yield compute_cigar(reference, reference)
        for rec in records:
            print(rec)
            yield rec

    def write_bam(records):
        BamIO.write(with_reference(records), bam_path, reference)

    _align_par(
        reference,
        records,
        BLOSUM62.load(),
        True,
        False,
        None,
        None,
        write_bam,
        False
    )

    # Sorting re-reads the file that was just written
    BamIO.sort(bam_path)
예제 #2
0
def _print_ref_and_seqs(refseq, seqrecords):
    """Print the reference and all other records to stdout in FASTA-like form."""
    print(">ref\n%s" % str(refseq.seq))
    print('\n'.join([">%s\n%s" % (str(k.id), str(k.seq)) for k in seqrecords]))


def run_group_alignment(sequence_group):
    """Codon-align a group of sequences against the longest member.

    Each sequence string has 'NNN' and '---' runs removed, remaining '-'
    characters turned into 'N', and is truncated to a multiple of three
    characters before being wrapped in a SeqRecord. The longest record
    (first one on ties) is used as the reference for ``align_to_refseq``.

    Args:
        sequence_group: mapping of sequence id -> sequence string.

    Returns:
        dict with keys
        'ref'       -- the reference record formatted as FASTA,
        'alignment' -- the alignment as FASTA text (for a single-sequence
                       group this is the reference itself),
        'seqs'      -- the list of non-reference records (for a
                       single-sequence group, the list holding that record).

    Raises:
        Exception: if ``align_to_refseq`` reports any discarded sequences.
        ValueError: if ``sequence_group`` is empty (no longest sequence).
    """
    print("%d sequences with matching JUNCTION regions" % (len(sequence_group) - 1))
    seqrecords = []

    for seq_id in sequence_group:
        massaged_string = (
            sequence_group[seq_id]
            .replace('NNN', '')
            .replace('---', '')
            .replace('-', 'N')
        )
        # Trim trailing characters so the length is a whole number of codons.
        overhang = len(massaged_string) % 3
        if overhang:
            massaged_string = massaged_string[:len(massaged_string) - overhang]

        seqrecords.append(gapless(Bio.SeqRecord.SeqRecord(
            Bio.Seq.Seq(massaged_string),
            id=seq_id,
            name=seq_id,
            description='',
        )))

    if len(seqrecords) == 1:
        refseq = seqrecords[0].format('fasta')
        return {'ref': refseq, 'alignment': refseq, 'seqs': seqrecords}

    # Use the longest sequence as the reference and remove it from the
    # list of sequences to be aligned.
    seq_lengths = [len(record.seq) for record in seqrecords]
    refseq = seqrecords.pop(seq_lengths.index(max(seq_lengths)))

    # Diagnostic dump when the reference is not a whole number of codons.
    if len(refseq.seq) % 3:
        _print_ref_and_seqs(refseq, seqrecords)

    sm = BLOSUM62.load()

    msa, discarded = align_to_refseq(
        refseq,
        seqrecords,
        score_matrix=sm,
        do_codon=True,
        reverse_complement=False,
        keep_insertions=False,
    )

    if len(discarded):
        # Dump everything needed to reproduce the failure before aborting.
        _print_ref_and_seqs(refseq, seqrecords)
        print(discarded)
        raise Exception("Non-empty discarded")

    with io.StringIO() as string_buffer:
        Bio.SeqIO.write(msa, string_buffer, "fasta")
        all_lines = string_buffer.getvalue()

    return {'ref': refseq.format('fasta'), 'alignment': all_lines, 'seqs': seqrecords}
예제 #3
0
파일: __init__.py 프로젝트: nlhepler/BioExt
    def __call__(
            self,
            ref,
            query,
            open_insertion=None,
            extend_insertion=None,
            open_deletion=None,
            extend_deletion=None,
            miscall_cost=None,
            do_local=None,
            do_affine=None
            ):
        """Align ``query`` to ``ref`` and return the score and aligned pair.

        Both inputs are first passed through ``gapless()``. Each may be a
        ``SeqRecord``, a ``Seq`` or a plain string; the aligned sequences
        are returned wrapped in the same kind of object as the
        corresponding input.

        Args:
            ref: reference sequence.
            query: query sequence.
            open_insertion, extend_insertion, open_deletion,
            extend_deletion, miscall_cost, do_local, do_affine:
                per-call overrides; ``None`` means "use the value this
                aligner was initialized with".

        Returns:
            Tuple ``(score, ref_aligned, query_aligned)``. The score is
            divided by the query length (by a third of it in codon mode)
            when the query is non-empty, and is ``-Inf`` for an empty query.

        Raises:
            ValueError: in codon mode, if ``len(ref)`` is not a multiple
                of 3.
        """

        # populate defaults from initialization
        if open_insertion is None:
            open_insertion = self.__open_insertion
        if extend_insertion is None:
            extend_insertion = self.__extend_insertion
        if open_deletion is None:
            open_deletion = self.__open_deletion
        if extend_deletion is None:
            extend_deletion = self.__extend_deletion
        if miscall_cost is None:
            miscall_cost = self.__miscall_cost
        if do_local is None:
            do_local = self.__do_local
        if do_affine is None:
            do_affine = self.__do_affine

        ref = gapless(ref)
        query = gapless(query)

        # if the reference and query are the same, we can return early
        # NOTE(review): this path divides by len(ref) even in codon mode,
        # whereas the general path below divides by len(query_) / 3 --
        # confirm the two normalizations are meant to differ.
        if len(ref) and ref == query:
            if self.__do_codon:
                score = sum(self.__score_matrix[char, char] for char in _translate(ref))
            else:
                score = sum(self.__score_matrix[char, char] for char in ref)
            return score / len(ref), ref, query

        # reduce both inputs to plain strings for the low-level aligner
        if isinstance(ref, SeqRecord):
            ref_ = str(ref.seq)
        elif isinstance(ref, Seq):
            ref_ = str(ref)
        else:
            ref_ = ref

        if isinstance(query, SeqRecord):
            query_ = str(query.seq)
        elif isinstance(query, Seq):
            query_ = str(query)
        else:
            query_ = query

        # convert to uppercase, because _align assumes it
        ref_ = ref_.upper()
        query_ = query_.upper()

        if self.__do_codon and len(ref_) % 3 != 0:
            raise ValueError('when do_codon = True, len(ref) must be a multiple of 3')

        # NOTE: padding the query to a multiple of 3 in codon mode is
        # currently disabled; the original code is kept for reference:
        #     if self.__do_codon and len(query_) % 3 != 0:
        #         ns = 3 - len(query_) % 3
        #         query_ += 'N' * ns
        #     else:
        #         ns = 0

        # for shared memory safety, recreate matrices if the PID changed
        current_pid = getpid()
        if self.__cached_pid != current_pid:
            self.__cached_pid = current_pid
            self.__cached_score_matrix = np.empty((1,), dtype=float)
            self.__cached_deletion_matrix = np.empty((1,), dtype=float)
            self.__cached_insertion_matrix = np.empty((1,), dtype=float)

        # flat work-matrix size: one row per reference position (per codon
        # in codon mode) plus one, times one column per query position plus one
        if self.__do_codon:
            cache_size = (len(ref_) // 3 + 1) * (len(query_) + 1)
        else:
            cache_size = (len(ref_) + 1) * (len(query_) + 1)

        # grow (never shrink) the cached buffers so they can be reused
        if self.__cached_score_matrix.shape[0] < cache_size:
            self.__cached_score_matrix.resize((cache_size,))

        if do_affine:
            if self.__cached_deletion_matrix.shape[0] < cache_size:
                self.__cached_deletion_matrix.resize((cache_size,))

            if self.__cached_insertion_matrix.shape[0] < cache_size:
                self.__cached_insertion_matrix.resize((cache_size,))

        if len(query) == 0:
            # nothing to align: the query becomes all gaps
            score, ref_aligned, query_aligned = float('-Inf'), ref_, '-' * len(ref_)
        else:
            score, ref_aligned, query_aligned = _align(
                ref_.encode('utf-8'),
                query_.encode('utf-8'),
                self.__nchars,
                self.__char_map,
                self.__score_matrix_,
                self.__score_matrix_.shape[0],
                open_insertion,
                extend_insertion,
                open_deletion,
                extend_deletion,
                miscall_cost,
                do_local,
                do_affine,
                self.__do_codon,
                self.__codon3x5,
                self.__codon3x4,
                self.__codon3x2,
                self.__codon3x1,
                self.__cached_score_matrix,
                self.__cached_deletion_matrix,
                self.__cached_insertion_matrix
                )

            if sys.version_info >= (3, 0):
                ref_aligned = ref_aligned.decode('utf-8')
                query_aligned = query_aligned.decode('utf-8')

        # re-wrap the aligned strings in the same type as the inputs
        if isinstance(ref, SeqRecord):
            ref_aligned_ = SeqRecord(
                Seq(ref_aligned, ref.seq.alphabet),
                id=ref.id,
                name=ref.name,
                description=ref.description,
                dbxrefs=ref.dbxrefs,
                annotations=ref.annotations
                )
        elif isinstance(ref, Seq):
            # a bare Seq carries its alphabet directly; it has no `.seq`
            ref_aligned_ = Seq(ref_aligned, ref.alphabet)
        else:
            ref_aligned_ = ref_aligned

        if isinstance(query, SeqRecord):
            query_aligned_ = SeqRecord(
                Seq(query_aligned, query.seq.alphabet),
                id=query.id,
                name=query.name,
                description=query.description,
                dbxrefs=query.dbxrefs,
                annotations=query.annotations
                )
        elif isinstance(query, Seq):
            # a bare Seq carries its alphabet directly; it has no `.seq`
            query_aligned_ = Seq(query_aligned, query.alphabet)
        else:
            query_aligned_ = query_aligned

        # normalize score to per-position
        if len(query_):
            score /= (len(query_) / 3) if self.__do_codon else len(query_)

        return score, ref_aligned_, query_aligned_
예제 #4
0
파일: __init__.py 프로젝트: sdwfrost/BioExt
    def __call__(self,
                 ref,
                 query,
                 open_insertion=None,
                 extend_insertion=None,
                 open_deletion=None,
                 extend_deletion=None,
                 miscall_cost=None,
                 do_local=None,
                 do_affine=None):
        """Align ``query`` to ``ref`` and return the score and aligned pair.

        Both inputs are first passed through ``gapless()``. Each may be a
        ``SeqRecord``, a ``Seq`` or a plain string; the aligned sequences
        are returned wrapped in the same kind of object as the
        corresponding input.

        Args:
            ref: reference sequence.
            query: query sequence.
            open_insertion, extend_insertion, open_deletion,
            extend_deletion, miscall_cost, do_local, do_affine:
                per-call overrides; ``None`` means "use the value this
                aligner was initialized with".

        Returns:
            Tuple ``(score, ref_aligned, query_aligned)``. The score is
            divided by the query length (by a third of it in codon mode)
            when the query is non-empty, and is ``-Inf`` for an empty query.

        Raises:
            ValueError: in codon mode, if ``len(ref)`` is not a multiple
                of 3.
        """

        # populate defaults from initialization
        if open_insertion is None:
            open_insertion = self.__open_insertion
        if extend_insertion is None:
            extend_insertion = self.__extend_insertion
        if open_deletion is None:
            open_deletion = self.__open_deletion
        if extend_deletion is None:
            extend_deletion = self.__extend_deletion
        if miscall_cost is None:
            miscall_cost = self.__miscall_cost
        if do_local is None:
            do_local = self.__do_local
        if do_affine is None:
            do_affine = self.__do_affine

        ref = gapless(ref)
        query = gapless(query)

        # if the reference and query are the same, we can return early
        # NOTE(review): this path divides by len(ref) even in codon mode,
        # whereas the general path below divides by len(query_) / 3 --
        # confirm the two normalizations are meant to differ.
        if len(ref) and ref == query:
            if self.__do_codon:
                score = sum(self.__score_matrix[char, char]
                            for char in _translate(ref))
            else:
                score = sum(self.__score_matrix[char, char] for char in ref)
            return score / len(ref), ref, query

        # reduce both inputs to plain strings for the low-level aligner
        if isinstance(ref, SeqRecord):
            ref_ = str(ref.seq)
        elif isinstance(ref, Seq):
            ref_ = str(ref)
        else:
            ref_ = ref

        if isinstance(query, SeqRecord):
            query_ = str(query.seq)
        elif isinstance(query, Seq):
            query_ = str(query)
        else:
            query_ = query

        # convert to uppercase, because _align assumes it
        ref_ = ref_.upper()
        query_ = query_.upper()

        if self.__do_codon and len(ref_) % 3 != 0:
            raise ValueError(
                'when do_codon = True, len(ref) must be a multiple of 3')

        # NOTE: padding the query to a multiple of 3 in codon mode is
        # currently disabled; the original code is kept for reference:
        #     if self.__do_codon and len(query_) % 3 != 0:
        #         ns = 3 - len(query_) % 3
        #         query_ += 'N' * ns
        #     else:
        #         ns = 0

        # for shared memory safety, recreate matrices if the PID changed
        current_pid = getpid()
        if self.__cached_pid != current_pid:
            self.__cached_pid = current_pid
            self.__cached_score_matrix = np.empty((1, ), dtype=float)
            self.__cached_deletion_matrix = np.empty((1, ), dtype=float)
            self.__cached_insertion_matrix = np.empty((1, ), dtype=float)

        # flat work-matrix size: one row per reference position (per codon
        # in codon mode) plus one, times one column per query position plus one
        if self.__do_codon:
            cache_size = (len(ref_) // 3 + 1) * (len(query_) + 1)
        else:
            cache_size = (len(ref_) + 1) * (len(query_) + 1)

        # grow (never shrink) the cached buffers so they can be reused
        if self.__cached_score_matrix.shape[0] < cache_size:
            self.__cached_score_matrix.resize((cache_size, ))

        if do_affine:
            if self.__cached_deletion_matrix.shape[0] < cache_size:
                self.__cached_deletion_matrix.resize((cache_size, ))

            if self.__cached_insertion_matrix.shape[0] < cache_size:
                self.__cached_insertion_matrix.resize((cache_size, ))

        if len(query) == 0:
            # nothing to align: the query becomes all gaps
            score, ref_aligned, query_aligned = float(
                '-Inf'), ref_, '-' * len(ref_)
        else:
            score, ref_aligned, query_aligned = _align(
                ref_.encode('utf-8'), query_.encode('utf-8'), self.__nchars,
                self.__char_map, self.__score_matrix_,
                self.__score_matrix_.shape[0], open_insertion,
                extend_insertion, open_deletion, extend_deletion, miscall_cost,
                do_local, do_affine, self.__do_codon, self.__codon3x5,
                self.__codon3x4, self.__codon3x2, self.__codon3x1,
                self.__cached_score_matrix, self.__cached_deletion_matrix,
                self.__cached_insertion_matrix)

            if sys.version_info >= (3, 0):
                ref_aligned = ref_aligned.decode('utf-8')
                query_aligned = query_aligned.decode('utf-8')

        # re-wrap the aligned strings in the same type as the inputs
        if isinstance(ref, SeqRecord):
            ref_aligned_ = SeqRecord(Seq(ref_aligned, ref.seq.alphabet),
                                     id=ref.id,
                                     name=ref.name,
                                     description=ref.description,
                                     dbxrefs=ref.dbxrefs,
                                     annotations=ref.annotations)
        elif isinstance(ref, Seq):
            # a bare Seq carries its alphabet directly; it has no `.seq`
            ref_aligned_ = Seq(ref_aligned, ref.alphabet)
        else:
            ref_aligned_ = ref_aligned

        if isinstance(query, SeqRecord):
            query_aligned_ = SeqRecord(Seq(query_aligned, query.seq.alphabet),
                                       id=query.id,
                                       name=query.name,
                                       description=query.description,
                                       dbxrefs=query.dbxrefs,
                                       annotations=query.annotations)
        elif isinstance(query, Seq):
            # a bare Seq carries its alphabet directly; it has no `.seq`
            query_aligned_ = Seq(query_aligned, query.alphabet)
        else:
            query_aligned_ = query_aligned

        # normalize score to per-position
        if len(query_):
            score /= (len(query_) / 3) if self.__do_codon else len(query_)

        return score, ref_aligned_, query_aligned_
예제 #5
0
 def output(records):
     """Append every record to ``alignment`` after gapless()/gapful(insertions=False)."""
     for rec in records:
         ungapped = gapless(rec)
         alignment.append(gapful(ungapped, insertions=False))
예제 #6
0
 def output(records):
     """Append every record to ``alignment`` after gapless(), gapful(insertions=False) and suffix_pad()."""
     for rec in records:
         ungapped = gapless(rec)
         regapped = gapful(ungapped, insertions=False)
         alignment.append(suffix_pad(regapped))
예제 #7
0
 def discard(record):
     """Write the upper-cased, gapless() form of ``record`` to ``discard_handle`` as FASTA."""
     cleaned = gapless(record.upper())
     SeqIO.write([cleaned], discard_handle, 'fasta')
예제 #8
0
def main(
        input_handle,
        output_handle,
        reference,
        expected_identity,
        alphabet,
        reverse_complement,
        score_matrix,
        discard_handle,
        do_sort,
        quiet,
        globalStartingPoint,
        extendGapPenalty
        ):
    """Align FASTA records to a reference and write the result as BAM.

    Args:
        input_handle: FASTA input, parsed with ``SeqIO.parse``.
        output_handle: destination passed to ``BamIO.write`` (and to
            ``BamIO.sort`` when ``do_sort`` is true).
        reference: reference sequence, or ``None`` to use the first input
            record (which is then also emitted in the output, aligned
            against itself).
        expected_identity: forwarded unchanged to ``_align_par``.
        alphabet: ``'dna'``, or ``'codon'`` to enable codon alignment; see
            the validation note in the body.
        reverse_complement: forwarded unchanged to ``_align_par``.
        score_matrix: object whose ``load()`` yields the score matrix.
        discard_handle: if truthy, records handed to the discard callback
            are written there as FASTA.
        do_sort: sort the BAM output after a successful alignment.
        quiet: forwarded unchanged to ``_align_par``.
        globalStartingPoint: forwarded unchanged to ``_align_par``.
        extendGapPenalty: forwarded unchanged to ``_align_par``.

    Returns:
        0 on success, -1 if ``_align_par``/``BamIO.sort`` raised
        ``FrequenciesError``.

    Raises:
        RuntimeError: if the score matrix cannot be loaded.
        ValueError: if the alphabet / score matrix check fails.
    """

    # Catch only ordinary errors: a bare `except:` would also convert
    # KeyboardInterrupt / SystemExit into a RuntimeError.
    try:
        score_matrix_ = score_matrix.load()
    except Exception:
        raise RuntimeError('could not load the score matrix')

    # NOTE(review): as written this only fires for alphabet == 'dna' with
    # a matrix that is neither a DNA nor a protein score matrix, and it
    # inspects `score_matrix` rather than the loaded `score_matrix_`.
    # The message suggests a stricter check was intended -- confirm.
    if ((alphabet == 'dna' and not isinstance(score_matrix, DNAScoreMatrix)) and
            not isinstance(score_matrix, ProteinScoreMatrix)):
        raise ValueError(
            'DNA alphabet requires a DNA score matrix, '
            'while amino and codon alphabets require a protein score matrix'
            )

    do_codon = alphabet == 'codon'

    records = SeqIO.parse(input_handle, 'fasta')

    # Without an explicit reference, grab the first record and make it
    # gapless once and for all; it must then be re-emitted in the output.
    reference_from_input = reference is None
    if reference_from_input:
        reference = gapless(next(records))

    def allseqs(records):
        if reference_from_input:
            yield compute_cigar(reference, reference)
        for record in records:
            yield record

    if discard_handle:
        def discard(record):
            SeqIO.write([gapless(record.upper())], discard_handle, 'fasta')
    else:
        discard = None

    def output(records):
        BamIO.write(
            allseqs(records),
            output_handle,
            reference
            )

    retcode = -1
    try:
        _align_par(
            reference,
            records,
            score_matrix_,
            do_codon,
            reverse_complement,
            expected_identity,
            discard,
            output,
            globalStartingPoint,
            extendGapPenalty,
            quiet
            )
        if do_sort:
            BamIO.sort(output_handle)
        retcode = 0
    except FrequenciesError:
        print(
            'supplied score-matrix does not imply a frequency distribution:',
            'please choose another matrix if you must filter on expected identity',
            file=sys.stderr
            )

    return retcode