def pairwise_align_dna(sequence, reference, regex_complied, gene):
    """
    Overlap-align a query to a reference and trim the alignment to the query's extent.

    :param sequence: (str) a query DNA sequence
    :param reference: (str) a reference DNA sequence (must start in reading frame 1)
    :param regex_complied: (regex_obj) a compiled regex pattern; the end of its first
                           match is used as the index where a run of flanking gaps stops
    :param gene: (str) the target gene (ENV, GAG, POL, etc.); "ENV" is aligned with a
                 mismatch score of -1, every other value with -2
    :return: (str) trimmed aligned query, (str) trimmed aligned reference,
             (int) (query start - reference start) modulo 3
    """
    # Overlap alignment keeps the whole query; only the mismatch penalty is gene-specific
    mismatch_score = -1 if gene == "ENV" else -2
    alignment = list(seqanpy.align_overlap(sequence, reference, band=-1, score_match=4,
                                           score_mismatch=mismatch_score, score_gapext=-3,
                                           score_gapopen=-14))
    query_row, ref_row = alignment[1], alignment[2]

    # First alignment column occupied by the query
    first_col = 0
    if query_row[0] == '-':
        first_col = regex_complied.search(query_row).end()

    # Last alignment column occupied by the query, as a negative slice bound:
    # search the reversed row and flip the sign of the match end
    last_col = None
    if query_row[-1] == '-':
        last_col = -regex_complied.search(query_row[::-1]).end()

    # First alignment column occupied by the reference
    ref_first_col = 0
    if ref_row[0] == '-':
        ref_first_col = regex_complied.search(ref_row).end()

    # Offset of the query relative to the (in-frame) reference start
    frame = (first_col - ref_first_col) % 3

    # Both rows are cut with the query's bounds
    return query_row[first_col:last_col], ref_row[first_col:last_col], frame
Example #2
0
    def get_consensus(self, region, PCR=1):
        '''Return this sample's consensus, cut to the given region, as a SeqRecord'''
        from Bio.Seq import Seq
        from Bio.SeqRecord import SeqRecord
        from Bio import SeqIO
        from seqanpy import align_overlap

        # Which fragment holds the region, and where in that fragment's reference
        roi = self.get_fragmented_roi(region, VERBOSE=0, include_genomewide=True)
        (fragment, start, stop) = roi

        consensus_path = self.get_consensus_filename(fragment, PCR=PCR)
        seq = SeqIO.read(consensus_path, 'fasta')
        reference_path = self.get_reference_filename(fragment, format='fasta')
        refseq = SeqIO.read(reference_path, 'fasta')[start:stop]

        # Keep only the consensus columns that face the reference region
        score, ali1, ali2 = align_overlap(seq, refseq)
        n_leading_gaps = len(ali2) - len(ali2.lstrip('-'))
        i_after_last = len(ali2.rstrip('-'))
        seq_region = ali1[n_leading_gaps:i_after_last]

        return SeqRecord(Seq(seq_region, seq.seq.alphabet),
                         id=self.name,
                         name=self.name,
                         description=self.name)
def merge_allele_counts(ref_genomewide, acs, VERBOSE=0):
    '''Merge the allele counts of all fragments into one genome-wide array

    Note: we do not require full coverage of all fragments, the missing
          ones will just have zero counts. Sometimes, cherry-picking the data
          fragment by fragment might be a better choice.

    Parameters:
      ref_genomewide: genome-wide reference; it is sliced, measured with len()
                      and passed to align_overlap
      acs: iterable of (fragment name, fragment reference, allele counts)
           triples; the counts are indexed as acsi[:, :, position]
      VERBOSE: any true value prints the fragment coordinates, >= 3 also
               prints a pairwise alignment per fragment

    Returns:
      integer array of shape (len(read_types), len(alpha), len(ref_genomewide))
    '''
    from hivwholeseq.utils.miseq import alpha, read_types
    from seqanpy import align_overlap

    # One counter per (read type, allele, genome-wide position), all zero at first
    ac = np.zeros((len(read_types), len(alpha), len(ref_genomewide)), int)

    # pos_ref is the running genome-wide position. Every fragment is searched
    # in the reference starting 1000 positions before the current pos_ref.
    # NOTE(review): if pos_ref ever drops below 1000 the slice start becomes
    # negative and counts from the end of the reference - confirm this cannot
    # happen with the fragments in use.
    pos_ref = 1000
    for (fr, ref, acsi) in acs:

        # Find the coordinates
        (score, ali1, ali2) = align_overlap(ref_genomewide[pos_ref - 1000:],
                                            ref,
                                            #score_gapopen=-20,
                                           )
        # Alignment columns spanned by the fragment (leading/trailing gaps excluded)
        fr_start = len(ali2) - len(ali2.lstrip('-'))
        fr_end = len(ali2.rstrip('-'))

        if VERBOSE:
            print fr, pos_ref - 1000 + fr_start, pos_ref - 1000 + fr_end

        # Scan the alignment
        # Convert the fragment start from window coordinates to genome-wide ones
        pos_ref = pos_ref - 1000 + fr_start
        # fr_start_ref and fr_end_ref are not used anywhere below
        fr_start_ref = pos_ref
        fr_end_ref = pos_ref + fr_end - fr_start
        pos_fr = 0
        for pos_ali in xrange(fr_start, fr_end):
            # Gap in genomewise, ignore position
            if ali1[pos_ali] == '-':
                pos_fr += 1
                continue

            # Gap in fragment, ignore FIXME: probably we should put deletions
            elif ali2[pos_ali] == '-':
                pos_ref += 1
                continue

            # Add the counts
            # NOTE: all fragments are treated the same, even in case of coverage
            # differences of orders of magnitude. This means, larger coverage
            # always wins. Maybe we want to implement this somewhat differently
            ac[:, :, pos_ref] += acsi[:, :, pos_fr]
            pos_fr += 1
            pos_ref += 1

        if VERBOSE >= 3:
            from hivwholeseq.utils.sequence import pretty_print_pairwise_ali
            # Most frequent allele per position, summed over read types
            cons = alpha[ac.sum(axis=0).argmax(axis=0)]
            # NOTE(review): cons is in genome-wide coordinates but is sliced
            # with the alignment columns fr_start/fr_end; fr_start_ref and
            # fr_end_ref look like the intended bounds - confirm.
            pretty_print_pairwise_ali((ali1[fr_start: fr_end],
                                       cons[fr_start: fr_end]),
                                      name1='gw',
                                      name2=fr,
                                      width=100)

    return ac
Example #4
0
def get_distance_reads_sequence(seq,
                                reads,
                                VERBOSE=0,
                                score_match=3,
                                score_mismatch=-3):
    '''Score how far each read pair is from a sequence

    For every read the shortfall is (score_match * columns spanned by the
    read) minus the overlap alignment score; the value returned for a pair is
    the sum over its reads. The result is a list with one entry per pair.
    '''
    from seqanpy import align_overlap

    target = ''.join(seq)

    def shortfall(read):
        # Best possible score over the span of the read, minus the actual score
        (score, ali_target, ali_read) = align_overlap(target,
                                                      read.seq,
                                                      score_match=score_match,
                                                      score_mismatch=score_mismatch)
        first = len(ali_read) - len(ali_read.lstrip('-'))
        after_last = len(ali_read.rstrip('-'))
        return score_match * (after_last - first) - score

    return [sum(shortfall(read) for read in read_pair) for read_pair in reads]
def check_suspect(reads, consensi_foreign, deltamax=30, VERBOSE=0):
    '''Check suspicious reads for closer distance to potential contaminants'''
    if VERBOSE >= 2:
        print 'Checking suspect read pair:', reads[0].qname,

    for consensus in consensi_foreign:
        conss = ''.join(consensus)
        delta_foreign = 0
        for read in reads:
            (score, ali1, ali2) = align_overlap(conss, read.seq)
            scoremax = 3 * len(ali2.strip('-'))
            delta_foreign += scoremax - score
        
        # We classify as trash all reads that are within a basin of another seq
        if delta_foreign < deltamax:
            if VERBOSE >= 2:
                print ''
                print consensus.name, delta_foreign
            return True

        else:
            if VERBOSE >= 2:
                print 'OK',

    if VERBOSE >= 2:
        print ''

    return False
Example #6
0
def check_suspect(reads, consensi_foreign, deltamax=30, VERBOSE=0):
    '''Check suspicious reads for closer distance to potential contaminants'''
    if VERBOSE >= 2:
        print 'Checking suspect read pair:', reads[0].qname,

    for consensus in consensi_foreign:
        conss = ''.join(consensus)
        delta_foreign = 0
        for read in reads:
            (score, ali1, ali2) = align_overlap(conss, read.seq)
            scoremax = 3 * len(ali2.strip('-'))
            delta_foreign += scoremax - score

        # We classify as trash all reads that are within a basin of another seq
        if delta_foreign < deltamax:
            if VERBOSE >= 2:
                print ''
                print consensus.name, delta_foreign
            return True

        else:
            if VERBOSE >= 2:
                print 'OK',

    if VERBOSE >= 2:
        print ''

    return False
def ammend_fasta():
    """Copy all records of `fname` to `fname_unspliced` and append joined spliced pairs.

    Every input record is written out unchanged and indexed by protein (the
    part of its name before the first '_') and by strain (second '|'-separated
    field of its description). For each (joined, first, second) triple in
    `splice_pairs`, and each strain that has the first protein, the two
    sequences are overlap-aligned and merged: the aligned first sequence is
    kept, and its gap columns are filled from the second one. The merged
    record is appended to the same output file.

    Strains without the second protein are reported and skipped. Any other
    failure (alignment, record construction, writing) propagates instead of
    being reported as a missing partner.
    """
    def swap_prefix(text, old, new):
        # Replace exactly one leading `old` by `new`. str.lstrip(old) would
        # strip any run of the *characters* in `old`, which can eat too much.
        if text.startswith(old):
            text = text[len(old):]
        return new + text

    with open(fname_unspliced, 'w') as ofile:
        strain_by_protein = defaultdict(dict)
        for seq in SeqIO.parse(fname, 'fasta'):
            SeqIO.write(seq, ofile, 'fasta')
            prot = seq.name.split('_')[0]
            seq_name = seq.description.split('|')[1]
            strain_by_protein[prot][seq_name] = seq

        joined_seqs = defaultdict(dict)
        splice_pairs = [('M', 'M1', 'M2'), ('NS', 'NS1', 'NS2')]
        #splice_pairs = [('M', 'M1', 'BM2')]
        from seqanpy import align_overlap
        for c, a, b in splice_pairs:
            for strain in strain_by_protein[a]:
                seq1 = strain_by_protein[a][strain]
                try:
                    seq2 = strain_by_protein[b][strain]
                except KeyError:
                    print(seq1.name, "doesn't have a partner")
                    continue

                new_id = swap_prefix(seq1.id, a, c)
                new_description = swap_prefix(seq1.description, a, c).replace(a, c)
                new_name = swap_prefix(seq1.name, a, c).replace(a, c)

                score, ali1, ali2 = align_overlap(seq1.seq, seq2.seq,
                                                  score_gapopen=-20, score_gapext=0)
                # One row of single characters per aligned sequence
                # (np.fromstring is deprecated)
                ali_array = np.array([list(ali1), list(ali2)])
                gaps_in_first = ali_array[0] == '-'
                tmpseq = np.copy(ali_array[0])
                tmpseq[gaps_in_first] = ali_array[1][gaps_in_first]
                joined_seqs[c][strain] = SeqRecord.SeqRecord(seq=Seq.Seq("".join(tmpseq)), id=new_id,
                                                             description=new_description, name=new_name)
                SeqIO.write(joined_seqs[c][strain], ofile, 'fasta')
Example #8
0
def determine_lineage(seq):
	fields = map(lambda x:x.strip(), seq.description.split('|'))
	tmp_lineage = (fields[2], fields[4])
	if tmp_lineage in patterns:
		print fields[0],"\n\tgisaid defined lineage:",tmp_lineage,'->',patterns[tmp_lineage]
		return patterns[tmp_lineage]
	else:
		scores = []
		for olineage, oseq in outgroups.iteritems():
			if (params.aligner == "seqan"):
				from seqanpy import align_overlap
				tmp_aln = align_overlap(str(oseq.seq), str(seq.seq).replace('-','').upper(),
							score_gapopen=-10, score_gapext=-1)
				tmp_aln = np.array([np.fromstring(tmp_aln[1], dtype='|S1'), np.fromstring(tmp_aln[2], dtype='|S1')])
			if (params.aligner == "mafft"):
				SeqIO.write([oseq, seq], "temp_in.fasta", "fasta")
				os.system("mafft --auto temp_in.fasta > temp_out.fasta 2>tmp")
				tmp_aln = np.array(AlignIO.read('temp_out.fasta', 'fasta'))
			scores.append((olineage, (tmp_aln[0]==tmp_aln[1]).sum()))
		scores.sort(key = lambda x:x[1], reverse=True)
		if scores[0][1]>0.85*len(seq):
			print fields[0], tmp_lineage, len(seq), "\n\t lineage based on similarity:",scores[0][0],"\n\t",scores
			return scores[0][0]
		else:
			print fields[0], tmp_lineage, len(seq), "\n\t other: best scores:",scores[0]
			return 'other'
def align_to_initial(reads, p):
    '''Annotate reads with a log-likelihood and a distance from the initial sequence

    The window is the 'p17' annotation of `p` on fragment 'F1', extended by
    20 positions upstream and 7 downstream. Each read is overlap-aligned to
    the initial sequence in that window and receives two attributes:

      read.prob: log(frequency + 0.001) of the read's alleles, summed over
                 the usable positions; one value per entry along the first
                 axis of the allele frequency trajectories. Usable positions
                 have no gap and no 'N' in either row and are not conserved
                 (some entry has no frequency above 0.99 among the first
                 four alleles).
      read.distance: fraction of positions, without gap or 'N' and outside
                     the first 20 and last 7 window positions, at which the
                     read differs from the initial sequence.

    Errors propagate to the caller; nothing is returned.
    '''
    from hivevo.sequence import alphal
    seg, start, stop = 'F1', int(p.annotation['p17'].location.start)-20, int(p.annotation['p17'].location.end)+7
    ref_seq = "".join(p.get_initial_sequence(seg)[start:stop])
    aft = p.get_allele_frequency_trajectories('F1')[:,:,start:stop]
    founder_indices = np.array([alphal.index(nuc) for nuc in ref_seq])

    # Conservation depends only on the trajectories, not on the read
    unconserved = ~((aft[:,:4,:].max(axis=1)>0.99).all(axis=0))

    for read in reads:
        score, ali_ref, ali_read = align_overlap(ref_seq, str(read.seq.ungap('-')))
        tmp_ali = np.vstack([np.fromstring(a, 'S1') for a in (ali_ref, ali_read)])

        # Keep only columns where the reference has a base, so that masks and
        # indices line up with the window coordinates of aft/founder_indices
        reference_aln = (tmp_ali[0]!='-')
        ungapped = (~np.any(tmp_ali=='-', axis=0))[reference_aln]
        unamb = (~np.any(tmp_ali=='N', axis=0))[reference_aln]
        read_alleles = np.array([alphal.index(nuc) for nuc in tmp_ali[1]])[reference_aln]

        good_positions = ungapped&unamb&unconserved
        read.prob = np.sum(np.log(aft[:, read_alleles[good_positions], good_positions] + 0.001), axis=1)

        # For the distance, conserved positions count but the flanks do not
        good_positions = ungapped&unamb
        good_positions[:20]=False
        good_positions[-7:]=False
        read.distance = np.mean((founder_indices[good_positions]!=read_alleles[good_positions]))
Example #10
0
def determine_lineage(seq):
	fields = map(lambda x:x.strip(), seq.description.split('|'))
	tmp_lineage = (fields[2], fields[4])
	if tmp_lineage in patterns:
#		print fields[0],"\n\tgisaid defined lineage:",tmp_lineage,'->',patterns[tmp_lineage]
		return patterns[tmp_lineage]
	else:
		scores = []
		for olineage, oseq in outgroups.iteritems():
			if (params.aligner == "seqan"):
				from seqanpy import align_overlap
				tmp_aln = align_overlap(str(oseq.seq), str(seq.seq).replace('-','').upper(),
							score_gapopen=-10, score_gapext=-1)
				tmp_aln = np.array([np.fromstring(tmp_aln[1], dtype='|S1'), np.fromstring(tmp_aln[2], dtype='|S1')])
			if (params.aligner == "mafft"):
				SeqIO.write([oseq, seq], "temp_in.fasta", "fasta")
				os.system("mafft --auto temp_in.fasta > temp_out.fasta 2>tmp")
				tmp_aln = np.array(AlignIO.read('temp_out.fasta', 'fasta'))
			scores.append((olineage, (tmp_aln[0]==tmp_aln[1]).sum()))
		scores.sort(key = lambda x:x[1], reverse=True)
		if scores[0][1]>0.85*len(seq):
			print fields[0], tmp_lineage, len(seq), "\n\t lineage based on similarity:",scores[0][0],"\n\t",scores
			return scores[0][0]
		else:
			print fields[0], tmp_lineage, len(seq), "\n\t other: best scores:",scores[0]
			return 'other'
def get_gene_positions_in_fragment(gene, fragment, gwseq, fragseq, VERBOSE=0):
    '''Get the coordinates of a gene within a fragment

    Parameters:
      gene: id of the gene feature in gwseq.features
      fragment: id of the fragment feature in gwseq.features
      gwseq: genome-wide annotated record; its features need `id`,
             `location.nofuzzy_start`, `location.nofuzzy_end` and `extract`
      fragseq: the fragment sequence; must be as long as the fragment
               extracted from gwseq
      VERBOSE: >= 2 prints which part of the gene was found

    Returns:
      array of positions relative to the fragment start. When the fragment
      covers only part of the gene, the array is trimmed to a multiple of 3.

    Raises:
      ValueError: if a feature id is missing, or if fragseq and the fragment
                  extracted from gwseq differ in length.
    '''
    feature_ids = [fea.id for fea in gwseq.features]

    # Find coordinates of gene in reference
    feagene = gwseq.features[feature_ids.index(gene)]
    gene_start = feagene.location.nofuzzy_start
    gene_end = feagene.location.nofuzzy_end

    # Sanity check on coordinates. The fragment feature is looked up in the
    # same record that provided the index (gwseq), not in an outside name.
    feafrag = gwseq.features[feature_ids.index(fragment)]
    fragrefgw = feafrag.extract(gwseq)
    if len(fragseq) != len(fragrefgw):
        raise ValueError(
            'Problem with coordinates between fragment and genomewide.')

    # Find coordinates of gene in fragment
    frag_start = feafrag.location.nofuzzy_start
    frag_end = feafrag.location.nofuzzy_end

    # complete gene
    if (frag_start <= gene_start) and (frag_end >= gene_end):
        if VERBOSE >= 2:
            print('Complete gene found')
        positions = np.arange(gene_start, gene_end) - frag_start

    # start of gene: the frame is known from the start, trim the tail
    elif (frag_start <= gene_start):
        if VERBOSE >= 2:
            print('WARNING: only gene start found')
        positions = np.arange(gene_start, frag_end) - frag_start
        if len(positions) % 3:
            positions = positions[:-(len(positions) % 3)]

    # end of gene: the frame is known from the end, trim the head
    elif (frag_end >= gene_end):
        if VERBOSE >= 2:
            print('WARNING: only gene end found')
        positions = np.arange(frag_start, gene_end) - frag_start
        if len(positions) % 3:
            positions = positions[len(positions) % 3:]

    # middle of gene: guess reading frame
    else:
        if VERBOSE >= 2:
            print('WARNING: only gene middle found')
        prot = feagene.extract(gwseq).seq.translate()
        ali_score = []
        # Translate in all three frames and keep the one whose protein
        # aligns best to the translated gene
        for rf_start in range(3):
            tmpseq = fragseq[rf_start:].seq
            if len(tmpseq) % 3:
                tmpseq = tmpseq[:-(len(tmpseq) % 3)]
            tmpprot = tmpseq.translate()
            (score, ali1, ali2) = align_overlap(tmpprot, prot)
            ali_score.append(score)
        rf_start = np.argmax(ali_score)
        positions = np.arange(frag_start + rf_start, frag_end) - frag_start
        if len(positions) % 3:
            positions = positions[:-(len(positions) % 3)]

    return positions
def get_gene_positions_in_fragment(gene, fragment, gwseq, fragseq, VERBOSE=0):
    """Get the coordinates of a gene within a fragment

    Parameters:
      gene: id of the gene feature in gwseq.features
      fragment: id of the fragment feature in gwseq.features
      gwseq: genome-wide annotated record; its features need `id`,
             `location.nofuzzy_start`, `location.nofuzzy_end` and `extract`
      fragseq: the fragment sequence; must be as long as the fragment
               extracted from gwseq
      VERBOSE: >= 2 prints which part of the gene was found

    Returns:
      array of positions relative to the fragment start. When the fragment
      covers only part of the gene, the array is trimmed to a multiple of 3.

    Raises:
      ValueError: if a feature id is missing, or if fragseq and the fragment
                  extracted from gwseq differ in length.
    """
    feature_ids = [fea.id for fea in gwseq.features]

    # Find coordinates of gene in reference
    feagene = gwseq.features[feature_ids.index(gene)]
    gene_start = feagene.location.nofuzzy_start
    gene_end = feagene.location.nofuzzy_end

    # Sanity check on coordinates. The fragment feature is looked up in the
    # same record that provided the index (gwseq), not in an outside name.
    feafrag = gwseq.features[feature_ids.index(fragment)]
    fragrefgw = feafrag.extract(gwseq)
    if len(fragseq) != len(fragrefgw):
        raise ValueError("Problem with coordinates between fragment and genomewide.")

    # Find coordinates of gene in fragment
    frag_start = feafrag.location.nofuzzy_start
    frag_end = feafrag.location.nofuzzy_end

    # complete gene
    if (frag_start <= gene_start) and (frag_end >= gene_end):
        if VERBOSE >= 2:
            print("Complete gene found")
        positions = np.arange(gene_start, gene_end) - frag_start

    # start of gene: the frame is known from the start, trim the tail
    elif frag_start <= gene_start:
        if VERBOSE >= 2:
            print("WARNING: only gene start found")
        positions = np.arange(gene_start, frag_end) - frag_start
        if len(positions) % 3:
            positions = positions[: -(len(positions) % 3)]

    # end of gene: the frame is known from the end, trim the head
    elif frag_end >= gene_end:
        if VERBOSE >= 2:
            print("WARNING: only gene end found")
        positions = np.arange(frag_start, gene_end) - frag_start
        if len(positions) % 3:
            positions = positions[len(positions) % 3 :]

    # middle of gene: guess reading frame
    else:
        if VERBOSE >= 2:
            print("WARNING: only gene middle found")
        prot = feagene.extract(gwseq).seq.translate()
        ali_score = []
        # Translate in all three frames and keep the one whose protein
        # aligns best to the translated gene
        for rf_start in range(3):
            tmpseq = fragseq[rf_start:].seq
            if len(tmpseq) % 3:
                tmpseq = tmpseq[: -(len(tmpseq) % 3)]
            tmpprot = tmpseq.translate()
            (score, ali1, ali2) = align_overlap(tmpprot, prot)
            ali_score.append(score)
        rf_start = np.argmax(ali_score)
        positions = np.arange(frag_start + rf_start, frag_end) - frag_start
        if len(positions) % 3:
            positions = positions[: -(len(positions) % 3)]

    return positions
Example #13
0
def trim_to_refseq(seq, refseq):
    '''Trim sequence to a reference sequence

    The two sequences are overlap-aligned (gap opening score -20) and the
    part of seq facing refseq, from refseq's first to its last aligned
    column, is returned as a slice of seq.
    '''
    from seqanpy import align_overlap

    (score, ali1, ali2) = align_overlap(seq, refseq, score_gapopen=-20)

    # Alignment columns spanned by the reference
    start = len(ali2) - len(ali2.lstrip('-'))
    end = len(ali2.rstrip('-'))

    # Alignment columns are indices into seq only if the aligned seq row has
    # no gaps before them: subtract the gaps to get real positions in seq
    seq_start = start - ali1[:start].count('-')
    seq_end = end - ali1[:end].count('-')

    return seq[seq_start: seq_end]
Example #14
0
def align_pairwise(seq1, seq2):
    """Align two sequences with seqanpy, falling back to Biopython on ImportError.

    Both paths take their scores from the module-level `scoring_params`. The
    seqanpy result is returned as is; the Biopython result is reordered to
    (score, aligned seq1, aligned seq2).
    """
    try:
        from seqanpy import align_overlap
        return align_overlap(seq1, seq2, **scoring_params)
    except ImportError:
        from Bio import pairwise2
        # Global alignment with free end gaps, first alignment only
        alignments = pairwise2.align.globalms(
            seq1, seq2,
            scoring_params['score_match'], scoring_params['score_mismatch'],
            scoring_params['score_gapopen'], scoring_params['score_gapext'],
            penalize_end_gaps=False, one_alignment_only=True)
        best = alignments[0]
        score, aligned1, aligned2 = best[2], best[0], best[1]
        return score, aligned1, aligned2
    def align_dna(seqstr, refstr, require_full_cover=True):
        '''Align a sequence to a reference and return both rows over the reference span

        With require_full_cover an overlap alignment is cut to the columns
        spanned by the reference. Otherwise a local alignment is used; the
        sequence row is padded with 'N' and the reference row with the
        unaligned reference flanks.
        '''
        if not require_full_cover:
            (score, alis, alir) = align_local(seqstr, refstr)

            # Locate the aligned reference piece via its first/last 50 bases
            reftrim = alir.replace('-', '')
            head = reftrim[:50]
            tail = reftrim[-50:]
            start = refstr.find(head)
            end = refstr.rfind(tail) + len(tail)

            left_pad = 'N' * start
            right_pad = 'N' * (len(refstr) - end)
            return (left_pad + alis + right_pad,
                    refstr[:start] + alir + refstr[end:])

        (score, alis, alir) = align_overlap(seqstr, refstr)
        start = len(alir) - len(alir.lstrip('-'))
        end = len(alir.rstrip('-'))
        return (alis[start: end], alir[start: end])
    def align_dna(seqstr, refstr, require_full_cover=True):
        '''Align a sequence to a reference and return both rows over the reference span

        With require_full_cover an overlap alignment is cut to the columns
        spanned by the reference. Otherwise a local alignment is used; the
        sequence row is padded with 'N' and the reference row with the
        unaligned reference flanks.
        '''
        if not require_full_cover:
            (score, alis, alir) = align_local(seqstr, refstr)

            # Locate the aligned reference piece via its first/last 50 bases
            reftrim = alir.replace('-', '')
            head = reftrim[:50]
            tail = reftrim[-50:]
            start = refstr.find(head)
            end = refstr.rfind(tail) + len(tail)

            left_pad = 'N' * start
            right_pad = 'N' * (len(refstr) - end)
            return (left_pad + alis + right_pad,
                    refstr[:start] + alir + refstr[end:])

        (score, alis, alir) = align_overlap(seqstr, refstr)
        start = len(alir) - len(alir.lstrip('-'))
        end = len(alir.rstrip('-'))
        return (alis[start:end], alir[start:end])
def get_distance_reads_sequence(seq, reads, VERBOSE=0,
                                score_match=3,
                                score_mismatch=-3):
    '''Score how far each read pair is from a sequence

    For every read the shortfall is (score_match * columns spanned by the
    read) minus the overlap alignment score; the value returned for a pair is
    the sum over its reads. The result is a list with one entry per pair.
    '''
    from seqanpy import align_overlap

    target = ''.join(seq)

    def shortfall(read):
        # Best possible score over the span of the read, minus the actual score
        (score, ali_target, ali_read) = align_overlap(target, read.seq,
                                                      score_match=score_match,
                                                      score_mismatch=score_mismatch)
        first = len(ali_read) - len(ali_read.lstrip('-'))
        after_last = len(ali_read.rstrip('-'))
        return score_match * (after_last - first) - score

    return [sum(shortfall(read) for read in read_pair) for read_pair in reads]
Example #18
0
# Script: runs the seqanpy pairwise aligners on toy sequences and prints
# whatever each call returns
if __name__ == '__main__':

    # Try import
    import seqanpy as sap

    # Global pairwise alignment
    seq1 = 'AAAGGTCTA'
    seq2 = 'AAATCGA'
    output = sap.align_global(seq1, seq2, band=5)
    print output

    # Overlap pairwise alignment
    seq1 = 'AAAGGTCTA'
    seq2 = 'ATCT'
    output = sap.align_overlap(seq1, seq2)
    print output

    # Overlap pairwise alignment cutting flanks
    # (same inputs as above, only cut_flanks=True is added)
    seq1 = 'AAAGGTCTA'
    seq2 = 'ATCT'
    output = sap.align_overlap(seq1, seq2, cut_flanks=True)
    print output

    # Ladder pairwise alignment
    seq1 = 'AAAGGTCTA'
    seq2 = 'TCTAGGGAAACCC'
    output = sap.align_ladder(seq1, seq2)
    print output

    # Local pairwise alignment
    # NOTE(review): no code follows this heading here - the example looks cut off
Example #19
0
def overlap_test():
    '''Check align_overlap on a short sequence nested inside a longer one'''
    print('Test align_overlap')
    import seqanpy

    longer, shorter = 'ACCGT', 'CCG'
    (score, row_longer, row_shorter) = seqanpy.align_overlap(longer, shorter)

    # The longer sequence stays as is, the shorter one is flanked by gaps
    assert row_longer == 'ACCGT'
    assert row_shorter == '-CCG-'
                scoremax = 3 * len(ali1)
                delta = scoremax - score
                ali = [ali2, ali1]

                print 'Alignment to its own consensus (delta = '+str(delta)+')'
                pretty_print_pairwise_ali(ali,
                                          'cons',
                                          'read'+str(i+1)+' '+str(edge),
                                          len_name=25, width=90)
                print ''

                # Compare to all consensi and find the closest
                alifr = alis[fragment]
                alifrpw = []
                for cons in alifr:
                    alifrpw.append(align_overlap(cons.seq.ungap('-'), seq))
                scores = map(itemgetter(0), alifrpw)
                indmax = np.argmax(scores)

                alimax = alifrpw[indmax][1:]
                start = len(alimax[1]) - len(alimax[1].lstrip('-'))
                end = len(alimax[1].rstrip('-'))
                alimax = [s[start: end] for s in alimax]
                score = scores[indmax]
                scoremax = 3 * len(alimax[0])
                delta = scoremax - score

                name1 = ' '.join(['cons '] + alifr[indmax].name.split('_')[::2])
                name2 = ' '.join(['read'+str(i+1), pname, sample['patient sample']])
                print 'Alignment to best consensus (delta = '+str(delta)+')'
                pretty_print_pairwise_ali(alimax, name1, name2, len_name=25, width=90)
def get_minimal_distance_hist(bamfilename, consensi, maxreads=1000, VERBOSE=0):
    '''Get histogram of minimal distance of reads from consensi

    Parameters:
      bamfilename: BAM file, opened with pysam and read pair by pair
      consensi: sequence of consensus sequences; each is joined into a string
      maxreads: stop after this many usable read pairs
      VERBOSE: >= 2 reports skipped pairs, >= 3 traces every pair

    Returns:
      integer array with one entry per consensus: the number of usable read
      pairs for which that consensus had the smallest summed distance
      (substitutions plus insertion and deletion events).

    Raises:
      ValueError: if the two reads of a pair have different names.
    '''

    conssi = map(''.join, consensi)
    # m[i]: number of pairs closest to consensus i
    m = np.zeros(len(consensi), int)
    n_good = 0
    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        for irp, reads in enumerate(pair_generator(bamfile)):
            if n_good == maxreads:
                break

            if VERBOSE >= 3:
                print n_good + 1, 'Checking mindist for:', reads[0].qname,

            # Assign names
            (read1, read2) = reads
            # i_fwd is not used below
            i_fwd = reads[0].is_reverse

            # Check a few things to make sure we are looking at paired reads
            if read1.qname != read2.qname:
                raise ValueError('Read pair '+str(irp)+': reads have different names!')

            # Ignore unmapped reads
            if read1.is_unmapped or read2.is_unmapped:
                if VERBOSE >= 2:
                    print 'Read pair '+read1.qname+': unmapped'
                continue

            # Ignore not properly paired reads (this includes mates sitting on
            # different fragments)
            if (not read1.is_proper_pair) or (not read2.is_proper_pair):
                if VERBOSE >= 2:
                    print 'Read pair '+read1.qname+': not properly paired'
                continue

            n_good += 1

            # Get all distances
            ds_pair = np.zeros_like(m)
            for ic, consensus in enumerate(consensi):
                conss = conssi[ic]
                dpair = 0
                for read in reads:
                    seq = read.seq
                    ali = align_overlap(conss, seq)

                    # NOTE: it is possible that we start before conss' start or end after
                    # its end, but that IS evidence that it's not contamination from there.

                    # pos is not used below
                    pos = conss.find(ali[1].replace('-', ''))
                    # Aligned consensus (alim0) and read (alim1) as character arrays
                    alim0 = np.fromstring(ali[1], 'S1')
                    alim1 = np.fromstring(ali[2], 'S1')

                    # Score subst
                    # (columns that differ and have a gap in neither row)
                    d = ((alim0 != alim1) & (alim0 != '-') & (alim1 != '-')).sum()

                    # Score insertions
                    # Gaps in the consensus row. Assuming np.diff marks the
                    # gap/non-gap transitions, an inner gap run has two
                    # borders and a run touching an alignment end only one;
                    # the end is added as a border, so borders // 2 is the
                    # number of gap runs, end runs included.
                    gaps = alim0 == '-'
                    if gaps.sum():
                        n_gaps_borders = np.diff(gaps).sum()
                        n_gaps_borders += alim0[0] == '-'
                        n_gaps_borders += alim0[-1] == '-'
                        n_insertions = n_gaps_borders // 2
                        d += n_insertions

                    # Score deletions
                    # Gaps in the read row. Here the single border of a run
                    # touching an alignment end is subtracted instead, so
                    # end runs (consensus beyond the read) are not counted.
                    gaps = alim1 == '-'
                    if gaps.sum():
                        n_gaps_borders = np.diff(gaps).sum()
                        n_gaps_borders -= alim1[0] == '-'
                        n_gaps_borders -= alim1[-1] == '-'
                        n_deletions = n_gaps_borders // 2
                        d += n_deletions

                    dpair += d

                ds_pair[ic] = dpair

                if VERBOSE >= 3:
                    print 'OK',

            # Credit the consensus with the smallest distance (first one on ties)
            m[ds_pair.argmin()] += 1
            if VERBOSE >= 3:
                print ''

    return m
def filter_contamination(
    bamfilename,
    bamfilename_out,
    contseqs,
    samplename,
    VERBOSE=0,
    deltascore_max_self=60,
    deltascore_max_other=24,
    maxreads=-1,
    **kwargs
):
    """Fish contaminated reads from mapped reads

    The function checks for a maximal distance to the expected consensus, and only
    if it's more than that it checks all other samples.

    Read pairs are written to bamfilename_out, unless one of their reads is
    classified as contamination, in which case the pair goes to a second file
    whose name is bamfilename_out without its last 4 characters plus
    "_trashed.bam".

    Args:
      bamfilename: input BAM file, read pair by pair
      bamfilename_out: output BAM file for the accepted pairs
      contseqs: mapping from sample name to consensus; it must contain
                samplename, all other entries are candidate contaminants.
                Only a copy is modified.
      samplename: key of the sample's own consensus in contseqs
      VERBOSE: verbosity level, thresholds from 2 to 5 are used
      deltascore_max_self (int): the maximal delta in alignment score to the
                                 consensus to be considered pure
      deltascore_max_other (int): the maximal delta in alignment score to any other
                                  sample to be considered a contamination
      maxreads: stop once this many pairs have been processed; the default
                -1 never matches a pair index, so all pairs are processed
      **kwargs: passed down to the pairwise alignment function; "score_match"
                is also used to compute the best possible score (default 3)

    Returns:
      (number of accepted pairs, dict from contaminant name to number of
      trashed pairs)
    """
    import pysam
    from collections import defaultdict
    from operator import itemgetter
    from seqanpy import align_overlap

    from hivwholeseq.utils.mapping import pair_generator, get_number_reads

    # The match score must agree with the one used by the aligner, because
    # the best possible score is (aligned length) * score_match
    if "score_match" in kwargs:
        score_match = kwargs["score_match"]
    else:
        score_match = 3

    bamfilename_trash = bamfilename_out[:-4] + "_trashed.bam"

    # Split the sample's own consensus from the potential contaminants
    contseqs = contseqs.copy()
    consseq = contseqs.pop(samplename)

    if VERBOSE >= 2:
        print "Scanning reads (" + str(get_number_reads(bamfilename) // 2) + ")"

    with pysam.Samfile(bamfilename, "rb") as bamfile:
        with pysam.Samfile(bamfilename_out, "wb", template=bamfile) as bamfileout, pysam.Samfile(
            bamfilename_trash, "wb", template=bamfile
        ) as bamfiletrash:
            n_good = 0
            n_cont = defaultdict(int)

            for irp, reads in enumerate(pair_generator(bamfile)):
                if irp == maxreads:
                    break

                # Progress counter every 100 pairs, overwriting the previous one
                # NOTE(review): sys is not imported in this function - presumably
                # a module-level import, confirm.
                if VERBOSE >= 2:
                    if not ((irp + 1) % 100):
                        if not ((irp + 1) == 100):
                            sys.stdout.write("\x1b[1A")
                        print irp + 1

                # The else branch of this loop runs only if no read triggered
                # the break below, i.e. no read was classified as contamination
                for read in reads:

                    # Look for distance to the own consensus, it that's small move on
                    alignments_read = {}
                    deltas_read = {}
                    (score, alis1, alis2) = align_overlap(consseq, read.seq, **kwargs)
                    (alis1, alis2) = trim_align_overlap((alis1, alis2))
                    scoremax = len(alis1) * score_match
                    delta_read = scoremax - score
                    deltas_read[samplename] = delta_read
                    alignments_read[samplename] = (alis1, alis2)
                    if delta_read <= deltascore_max_self:
                        if VERBOSE >= 4:
                            print "Read is very close to its own consensus", scoremax, score, delta_read
                            pretty_print_pairwise_ali([alis1, alis2], width=90, name1="ref", name2="read")
                        continue

                    # Otherwise, move on to all other sequences and find the neighbour
                    for contname, contseq in contseqs.iteritems():
                        (score, ali1, ali2) = align_overlap(contseq, read.seq, **kwargs)
                        (ali1, ali2) = trim_align_overlap((ali1, ali2))
                        scoremax = len(ali1) * score_match
                        delta_read = scoremax - score
                        deltas_read[contname] = delta_read
                        alignments_read[contname] = (ali1, ali2)

                    if VERBOSE >= 5:
                        print samplename
                        for key, d in deltas_read.iteritems():
                            print key, d

                    # Closest sequence overall, own consensus included
                    (contname, delta_read) = min(deltas_read.iteritems(), key=itemgetter(1))

                    # Again, the correct consensus has precedence
                    if deltas_read[samplename] == delta_read:
                        contname = samplename

                    (ali1, ali2) = alignments_read[contname]

                    # NOTE(review): in the messages below scoremax and score
                    # still hold the values of the last alignment computed
                    # above, not necessarily those of contname.

                    # The read may be closest to its own consensus, if not very close
                    if contname == samplename:
                        if VERBOSE >= 4:
                            print "Read is closest to its consensus", scoremax, score, delta_read
                            pretty_print_pairwise_ali([ali1, ali2], width=90, name1="ref", name2="read")

                    # The read may come from another consensus (contamination)
                    elif delta_read <= deltascore_max_other:
                        # One contaminated read trashes the whole pair
                        n_cont[contname] += 1
                        bamfiletrash.write(reads[0])
                        bamfiletrash.write(reads[1])

                        if VERBOSE >= 2:
                            print "Contaminated read found! Good:", n_good, "cont:", sum(
                                n_cont.itervalues()
                            ), "sources:", n_cont

                        if VERBOSE >= 3:
                            print "Read is contaminated by", contname, scoremax, score, delta_read
                            pretty_print_pairwise_ali([alis1, alis2], width=90, name1="self", name2="read")
                            print ""
                            pretty_print_pairwise_ali([ali1, ali2], width=90, name1="ref", name2="read")

                        if VERBOSE >= 2:
                            print ""

                        break

                    # Finally, the read is not really close to anything: accept
                    else:
                        if VERBOSE >= 4:
                            print "Read is close to nothing really", scoremax, score, delta_read
                            pretty_print_pairwise_ali([ali1, ali2], width=90, name1="ref", name2="read")

                else:
                    # No read of the pair was flagged: keep the pair
                    n_good += 1
                    bamfileout.write(reads[0])
                    bamfileout.write(reads[1])

    # Return a plain dict rather than the defaultdict
    n_cont = dict(n_cont)

    return (n_good, n_cont)
def merge_allele_counts(ref_genomewide, acs, VERBOSE=0):
    '''Merge the allele counts of all fragments
    
    Note: we do not require full coverage of all fragments, the missing
          ones will just have zero counts. Sometimes, cherry-picking the data
          fragment by fragment might be a better choice.
    '''
    from hivwholeseq.utils.miseq import alpha, read_types
    from seqanpy import align_overlap

    ac = np.zeros((len(read_types), len(alpha), len(ref_genomewide)), int)

    pos_ref = 1000
    for (fr, ref, acsi) in acs:

        # Find the coordinates
        (score, ali1, ali2) = align_overlap(
            ref_genomewide[pos_ref - 1000:],
            ref,
            #score_gapopen=-20,
        )
        fr_start = len(ali2) - len(ali2.lstrip('-'))
        fr_end = len(ali2.rstrip('-'))

        if VERBOSE:
            print fr, pos_ref - 1000 + fr_start, pos_ref - 1000 + fr_end

        # Scan the alignment
        pos_ref = pos_ref - 1000 + fr_start
        fr_start_ref = pos_ref
        fr_end_ref = pos_ref + fr_end - fr_start
        pos_fr = 0
        for pos_ali in xrange(fr_start, fr_end):
            # Gap in genomewise, ignore position
            if ali1[pos_ali] == '-':
                pos_fr += 1
                continue

            # Gap in fragment, ignore FIXME: probably we should put deletions
            elif ali2[pos_ali] == '-':
                pos_ref += 1
                continue

            # Add the counts
            # NOTE: all fragments are treated the same, even in case of coverage
            # differences of orders of magnitude. This means, larger coverage
            # always wins. Maybe we want to implement this somewhat differently
            ac[:, :, pos_ref] += acsi[:, :, pos_fr]
            pos_fr += 1
            pos_ref += 1

        if VERBOSE >= 3:
            from hivwholeseq.utils.sequence import pretty_print_pairwise_ali
            cons = alpha[ac.sum(axis=0).argmax(axis=0)]
            pretty_print_pairwise_ali(
                (ali1[fr_start:fr_end], cons[fr_start:fr_end]),
                name1='gw',
                name2=fr,
                width=100)

    return ac
Example #24
0
def annotate_sequence(seqrecord,
                      additional_edges={},
                      additional_features=['chunk'],
                      VERBOSE=0):
    '''Annotate a consensus with the genes and stuff (in place)'''
    # TODO: what do we do with genes that do not start/end where they are
    # supposed to? Do we follow biology and track their new locations?
    from Bio.SeqFeature import SeqFeature, FeatureLocation, CompoundLocation
    from hivwholeseq.utils.genome_info import gene_edges, RNA_structure_edges, \
            other_edges, find_region_edges, find_region_edges_multiple, \
            locate_gene
    edge_dict = {
        'gene': gene_edges,
        'RNA structure': RNA_structure_edges,
        'other': other_edges
    }
    edge_dict.update(additional_edges)
    additional_features = ['protein'] + additional_features
    features = edge_dict.keys() + additional_features

    if VERBOSE:
        print 'Features:', ', '.join(features)

    smat = np.array(seqrecord)

    for feature_type in edge_dict:
        edges_all = edge_dict[feature_type]
        print feature_type, edge_dict[feature_type].keys()
        for name, edges in edges_all.iteritems():
            if VERBOSE >= 2:
                print name,

            # Skip a feature if it's present already
            if name in map(lambda x: x.id, seqrecord.features):
                if VERBOSE >= 2:
                    print 'already present.'
                continue

            # Behave differently for unsplit regions and split ones
            if len(edges) == 2:
                # LTR problems with F6
                if 'F6' in name:
                    pos_edge = find_region_edges(smat[6000::],
                                                 [edges[0], None])
                    pos_edge[0] += 6000
                elif feature_type == 'genes':
                    pos_edge = locate_gene(smat, name, output_compact=True)
                else:
                    pos_edge = find_region_edges(smat, edges)

                # Cut the primers for some features
                if (None not in pos_edge) and name in ['V1', 'V3', 'V4', 'V5']:
                    pos_edge[0] += len(edges[0])
                    pos_edge[1] -= len(edges[1])

                # Cut only the right primer for V2
                if (None not in pos_edge) and name in ['V2']:
                    pos_edge[1] -= len(edges[1])

                if pos_edge[0] is None:
                    if name not in ['F1', "LTR5'"]:
                        print 'WARNING: start not found'
                    pos_edge[0] = 0

                if pos_edge[1] is None:
                    if name not in ['F6', "LTR3'"]:
                        print 'WARNING: end not found'
                    pos_edge[1] = len(smat)

                location = FeatureLocation(*pos_edge)
            else:
                if feature_type == 'genes':
                    pos_edges = [
                        locate_gene(smat, name + suff, output_compact=True)
                        for suff in ('1', '2')
                    ]
                else:
                    pos_edges = find_region_edges_multiple(smat,
                                                           edges,
                                                           min_distance=1)
                locations = [
                    FeatureLocation(*pos_edge) for pos_edge in pos_edges
                ]
                location = CompoundLocation(locations)

            if VERBOSE >= 2:
                print 'found:', location

            feature = SeqFeature(location,
                                 type=feature_type,
                                 id=name,
                                 strand=1)
            seqrecord.features.append(feature)

    # Add proteins and other features from HXB2
    from operator import attrgetter
    from seqanpy import align_overlap
    from hivwholeseq.utils.genome_info import proteins, chunks
    from hivwholeseq.reference import load_custom_reference
    additional_features_dict = {}
    if 'protein' in additional_features:
        additional_features_dict['protein'] = proteins
    if 'chunk' in additional_features:
        additional_features_dict['chunk'] = chunks

    ref_ann = load_custom_reference('HXB2', 'gb')
    for feagroup, additional_features_grp in additional_features_dict.iteritems(
    ):
        for feaname in additional_features_grp:
            if VERBOSE >= 2:
                print feaname,

            fea = ref_ann.features[map(attrgetter('id'),
                                       ref_ann.features).index(feaname)]
            seq = fea.extract(ref_ann)
            (score, ali1, ali2) = align_overlap(seqrecord,
                                                seq,
                                                score_gapopen=-20)
            start = len(ali2) - len(ali2.lstrip('-'))
            end = len(ali2.rstrip('-'))
            end -= ali1[start:end].count('-')

            location = FeatureLocation(start, end)
            if VERBOSE >= 2:
                print 'found:', location

            feature = SeqFeature(location, type=feagroup, id=feaname, strand=1)
            seqrecord.features.append(feature)
def filter_contamination(bamfilename,
                         bamfilename_out,
                         contseqs,
                         samplename,
                         VERBOSE=0,
                         deltascore_max_self=60,
                         deltascore_max_other=24,
                         maxreads=-1,
                         **kwargs):
    '''Fish contaminated reads from mapped reads

    The function checks for a maximal distance to the expected consensus, and only
    if it's more than that it checks all other samples.
    
    Args:
      deltascore_max_self (int): the maximal delta in alignment score to the 
                                 consensus to be considered pure
      deltascore_max_other (int): the maximal delta in alignment score to any other
                                  sample to be considered a contamination
      **kwargs: passed down to the pairwise alignment function
    '''
    import pysam
    from collections import defaultdict
    from operator import itemgetter
    from seqanpy import align_overlap

    from hivwholeseq.utils.mapping import pair_generator, get_number_reads

    if 'score_match' in kwargs:
        score_match = kwargs['score_match']
    else:
        score_match = 3

    bamfilename_trash = bamfilename_out[:-4] + '_trashed.bam'

    contseqs = contseqs.copy()
    consseq = contseqs.pop(samplename)

    if VERBOSE >= 2:
        print 'Scanning reads (' + str(
            get_number_reads(bamfilename) // 2) + ')'

    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        with pysam.Samfile(bamfilename_out, 'wb', template=bamfile) as bamfileout, \
             pysam.Samfile(bamfilename_trash, 'wb', template=bamfile) as bamfiletrash:
            n_good = 0
            n_cont = defaultdict(int)

            for irp, reads in enumerate(pair_generator(bamfile)):
                if irp == maxreads:
                    break

                if VERBOSE >= 2:
                    if not ((irp + 1) % 100):
                        if not ((irp + 1) == 100):
                            sys.stdout.write('\x1b[1A')
                        print irp + 1

                for read in reads:

                    # Look for distance to the own consensus, it that's small move on
                    alignments_read = {}
                    deltas_read = {}
                    (score, alis1,
                     alis2) = align_overlap(consseq, read.seq, **kwargs)
                    (alis1, alis2) = trim_align_overlap((alis1, alis2))
                    scoremax = len(alis1) * score_match
                    delta_read = scoremax - score
                    deltas_read[samplename] = delta_read
                    alignments_read[samplename] = (alis1, alis2)
                    if delta_read <= deltascore_max_self:
                        if VERBOSE >= 4:
                            print 'Read is very close to its own consensus', scoremax, score, delta_read
                            pretty_print_pairwise_ali([alis1, alis2],
                                                      width=90,
                                                      name1='ref',
                                                      name2='read')
                        continue

                    # Otherwise, move on to all other sequences and find the neighbour
                    for contname, contseq in contseqs.iteritems():
                        (score, ali1,
                         ali2) = align_overlap(contseq, read.seq, **kwargs)
                        (ali1, ali2) = trim_align_overlap((ali1, ali2))
                        scoremax = len(ali1) * score_match
                        delta_read = scoremax - score
                        deltas_read[contname] = delta_read
                        alignments_read[contname] = (ali1, ali2)

                    if VERBOSE >= 5:
                        print samplename
                        for key, d in deltas_read.iteritems():
                            print key, d

                    (contname, delta_read) = min(deltas_read.iteritems(),
                                                 key=itemgetter(1))

                    # Again, the correct consensus has precedence
                    if deltas_read[samplename] == delta_read:
                        contname = samplename

                    (ali1, ali2) = alignments_read[contname]

                    # The read may be closest to its own consensus, if not very close
                    if contname == samplename:
                        if VERBOSE >= 4:
                            print 'Read is closest to its consensus', scoremax, score, delta_read
                            pretty_print_pairwise_ali([ali1, ali2],
                                                      width=90,
                                                      name1='ref',
                                                      name2='read')

                    # The read may come from another consensus (contamination)
                    elif (delta_read <= deltascore_max_other):
                        n_cont[contname] += 1
                        bamfiletrash.write(reads[0])
                        bamfiletrash.write(reads[1])

                        if VERBOSE >= 2:
                            print 'Contaminated read found! Good:', n_good, 'cont:', sum(
                                n_cont.itervalues()), 'sources:', n_cont

                        if VERBOSE >= 3:
                            print 'Read is contaminated by', contname, scoremax, score, delta_read
                            pretty_print_pairwise_ali([alis1, alis2],
                                                      width=90,
                                                      name1='self',
                                                      name2='read')
                            print ''
                            pretty_print_pairwise_ali([ali1, ali2],
                                                      width=90,
                                                      name1='ref',
                                                      name2='read')

                        if VERBOSE >= 2:
                            print ''

                        break

                    # Finally, the read is not really close to anything: accept
                    else:
                        if VERBOSE >= 4:
                            print 'Read is close to nothing really', scoremax, score, delta_read
                            pretty_print_pairwise_ali([ali1, ali2],
                                                      width=90,
                                                      name1='ref',
                                                      name2='read')

                else:
                    n_good += 1
                    bamfileout.write(reads[0])
                    bamfileout.write(reads[1])

    n_cont = dict(n_cont)

    return (n_good, n_cont)
Example #26
0
# Script: run each seqanpy pairwise alignment mode once and print the result
if __name__ == '__main__':

    # Try import
    import seqanpy as sap

    # One demo per mode: (mode, first sequence, second sequence, options)
    demos = [
        ('global', 'AAAGGTCTA', 'AAATCGA', {'band': 5}),
        ('overlap', 'AAAGGTCTA', 'ATCT', {}),
        ('ladder', 'AAAGGTCTA', 'TCTAGGGAAACCC', {}),
        ('local', 'AAAGGTCTACCGTAGCCT', 'AAGTCTAC', {}),
    ]
    for mode, seq1, seq2, options in demos:
        # The aligner is looked up only when its turn comes
        output = getattr(sap, 'align_' + mode)(seq1, seq2, **options)
        print(output)
def annotate_sequence(seqrecord, additional_edges={}, additional_features=['chunk'], VERBOSE=0):
    '''Annotate a consensus with the genes and stuff (in place)'''
    # TODO: what do we do with genes that do not start/end where they are
    # supposed to? Do we follow biology and track their new locations?
    from Bio.SeqFeature import SeqFeature, FeatureLocation, CompoundLocation
    from hivwholeseq.utils.genome_info import gene_edges, RNA_structure_edges, \
            other_edges, find_region_edges, find_region_edges_multiple, \
            locate_gene
    edge_dict = {'gene': gene_edges,
                 'RNA structure': RNA_structure_edges,
                 'other': other_edges}
    edge_dict.update(additional_edges)
    additional_features = ['protein'] + additional_features
    features = edge_dict.keys() + additional_features

    if VERBOSE:
        print 'Features:', ', '.join(features)

    smat = np.array(seqrecord)

    for feature_type in edge_dict:
        edges_all = edge_dict[feature_type]
        print feature_type, edge_dict[feature_type].keys()
        for name, edges in edges_all.iteritems():
            if VERBOSE >= 2:
                print name,

            # Skip a feature if it's present already
            if name in map(lambda x: x.id, seqrecord.features):
                if VERBOSE >= 2:
                    print 'already present.'
                continue

            # Behave differently for unsplit regions and split ones
            if len(edges) == 2:
                # LTR problems with F6
                if 'F6' in name:
                    pos_edge = find_region_edges(smat[6000::], [edges[0], None])
                    pos_edge[0] += 6000
                elif feature_type == 'genes':
                    pos_edge = locate_gene(smat, name, output_compact=True)
                else:
                    pos_edge = find_region_edges(smat, edges)

                # Cut the primers for some features
                if (None not in pos_edge) and name in ['V1', 'V3', 'V4', 'V5']:
                    pos_edge[0] += len(edges[0])
                    pos_edge[1] -= len(edges[1])

                # Cut only the right primer for V2
                if (None not in pos_edge) and name in ['V2']:
                    pos_edge[1] -= len(edges[1])

                if pos_edge[0] is None:
                    if name not in ['F1', "LTR5'"]:
                        print 'WARNING: start not found'
                    pos_edge[0] = 0

                if pos_edge[1] is None:
                    if name not in ['F6', "LTR3'"]:
                        print 'WARNING: end not found'
                    pos_edge[1] = len(smat)

                location = FeatureLocation(*pos_edge)
            else:
                if feature_type == 'genes':
                    pos_edges = [locate_gene(smat, name+suff, output_compact=True)
                                 for suff in ('1', '2')]
                else:
                    pos_edges = find_region_edges_multiple(smat, edges, min_distance=1)
                locations = [FeatureLocation(*pos_edge) for pos_edge in pos_edges]
                location = CompoundLocation(locations)

            if VERBOSE >= 2:
                print 'found:', location

            feature = SeqFeature(location, type=feature_type, id=name, strand=1)
            seqrecord.features.append(feature)

    # Add proteins and other features from HXB2
    from operator import attrgetter
    from seqanpy import align_overlap
    from hivwholeseq.utils.genome_info import proteins, chunks
    from hivwholeseq.reference import load_custom_reference
    additional_features_dict = {}
    if 'protein' in additional_features:
        additional_features_dict['protein'] = proteins
    if 'chunk' in additional_features:
        additional_features_dict['chunk'] = chunks

    ref_ann = load_custom_reference('HXB2', 'gb')
    for feagroup, additional_features_grp in additional_features_dict.iteritems():
        for feaname in additional_features_grp:
            if VERBOSE >= 2:
                print feaname,

            fea = ref_ann.features[map(attrgetter('id'), ref_ann.features).index(feaname)]
            seq = fea.extract(ref_ann)
            (score, ali1, ali2) = align_overlap(seqrecord, seq, score_gapopen=-20)
            start = len(ali2) - len(ali2.lstrip('-'))
            end = len(ali2.rstrip('-'))
            end -= ali1[start: end].count('-')

            location = FeatureLocation(start, end)
            if VERBOSE >= 2:
                print 'found:', location

            feature = SeqFeature(location, type=feagroup, id=feaname, strand=1)
            seqrecord.features.append(feature)
            ]
            seq = ''.join(seql)
            s, a1, a2 = align_local(prot['seq_aa'], seq)
            scores.append(s)
        chain = list(struc.get_chains())[np.argmax(s)]
        seql = [d3to1.get(r.get_resname(), 'O') for r in chain.get_residues()]
        seq = ''.join(seql)

        # Flag all mutations
        for m, mut in muts.iterrows():
            mutations.at[m, 'PDB_fn'] = fns[protname]
            mutations.at[m, 'PDB_id'] = fns[protname].split(
                '_')[1].upper().split('.')[0]
            mutations.at[m, 'PDB_chain'] = chain.id

            s, a1, a2 = align_overlap(seq, mut['context_protein'])
            # The focal allele is always small and is the only such letter
            pos_in_context = 4
            pos = a2.find(mut['context_protein'][pos_in_context])
            pos -= a1[:pos].count('-')
            mutations.at[m, 'PDB_pos_in_chain'] = pos
            mutations.at[m, 'PDB_allele'] = seq[pos]
            print('Ref: ' + mut['context_protein'])
            print('PDB: ' + seq[pos - 4:pos] + seq[pos].lower() +
                  seq[pos + 1:pos + 5])

    #mutations.to_csv('../data/mutations_highvariance_summary.tsv', sep='\t', index=True)

    print('Load multiple sequence alignments')
    from Bio import AlignIO
    protname = 'NS1'
Example #29
0
def get_minimal_distance_hist(bamfilename, consensi, maxreads=1000, VERBOSE=0):
    '''Get histogram of minimal distance of reads from consensi'''

    conssi = map(''.join, consensi)
    m = np.zeros(len(consensi), int)
    n_good = 0
    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        for irp, reads in enumerate(pair_generator(bamfile)):
            if n_good == maxreads:
                break

            if VERBOSE >= 3:
                print n_good + 1, 'Checking mindist for:', reads[0].qname,

            # Assign names
            (read1, read2) = reads
            i_fwd = reads[0].is_reverse

            # Check a few things to make sure we are looking at paired reads
            if read1.qname != read2.qname:
                raise ValueError('Read pair ' + str(irp) +
                                 ': reads have different names!')

            # Ignore unmapped reads
            if read1.is_unmapped or read2.is_unmapped:
                if VERBOSE >= 2:
                    print 'Read pair ' + read1.qname + ': unmapped'
                continue

            # Ignore not properly paired reads (this includes mates sitting on
            # different fragments)
            if (not read1.is_proper_pair) or (not read2.is_proper_pair):
                if VERBOSE >= 2:
                    print 'Read pair ' + read1.qname + ': not properly paired'
                continue

            n_good += 1

            # Get all distances
            ds_pair = np.zeros_like(m)
            for ic, consensus in enumerate(consensi):
                conss = conssi[ic]
                dpair = 0
                for read in reads:
                    seq = read.seq
                    ali = align_overlap(conss, seq)

                    # NOTE: it is possible that we start before conss' start or end after
                    # its end, but that IS evidence that it's not contamination from there.

                    pos = conss.find(ali[1].replace('-', ''))
                    alim0 = np.fromstring(ali[1], 'S1')
                    alim1 = np.fromstring(ali[2], 'S1')

                    # Score subst
                    d = ((alim0 != alim1) & (alim0 != '-') &
                         (alim1 != '-')).sum()

                    # Score insertions
                    gaps = alim0 == '-'
                    if gaps.sum():
                        n_gaps_borders = np.diff(gaps).sum()
                        n_gaps_borders += alim0[0] == '-'
                        n_gaps_borders += alim0[-1] == '-'
                        n_insertions = n_gaps_borders // 2
                        d += n_insertions

                    # Score deletions
                    gaps = alim1 == '-'
                    if gaps.sum():
                        n_gaps_borders = np.diff(gaps).sum()
                        n_gaps_borders -= alim1[0] == '-'
                        n_gaps_borders -= alim1[-1] == '-'
                        n_deletions = n_gaps_borders // 2
                        d += n_deletions

                    dpair += d

                ds_pair[ic] = dpair

                if VERBOSE >= 3:
                    print 'OK',

            m[ds_pair.argmin()] += 1
            if VERBOSE >= 3:
                print ''

    return m
Example #30
0
# Script: run each seqanpy pairwise alignment mode once and print the result
if __name__ == '__main__':

    # Try import
    import seqanpy as sap

    # One demo per mode: (mode, first sequence, second sequence, options)
    demos = [
        ('global', 'AAAGGTCTA', 'AAATCGA', {'band': 5}),
        ('overlap', 'AAAGGTCTA', 'ATCT', {}),
        ('ladder', 'AAAGGTCTA', 'TCTAGGGAAACCC', {}),
        ('local', 'AAAGGTCTACCGTAGCCT', 'AAGTCTAC', {}),
    ]
    for mode, seq1, seq2, options in demos:
        # The aligner is looked up only when its turn comes
        output = getattr(sap, 'align_' + mode)(seq1, seq2, **options)
        print(output)
    patient = load_patient(pname)
    refseqgw = patient.get_reference('genomewide')

    for fragment in fragments:
        if VERBOSE >= 1:
            print pname, fragment

        if VERBOSE >= 2:
            print 'Cutting out fragment', fragment

        # Get start coordinate
        if fragment == 'F1':
            start = 0
        else:
            prfwd = primers_outer[fragment][0]
            (score, ali1, ali2) = align_overlap(refseqgw, prfwd, score_gapopen=-20)
            start = len(ali2) - len(ali2.lstrip('-')) + len(prfwd)

        # Get end coordinate
        if fragment == 'F6':
            end = len(refseqgw)
        else:
            prrev = primers_outer[fragment][1]
            (score, ali1, ali2) = align_overlap(refseqgw, prrev, score_gapopen=-20)
            end = len(ali2) - len(ali2.lstrip('-'))

        refseq = refseqgw[start: end]

        refseq.id = patient.code+'_ref_'+fragment
        refseq.name = refseq.id
        refseq.description = 'Patient '+patient.code+', initial reference '+fragment