def merge_seqs(seq1, seq2, overlap): '''Merge seq1 and seq2 assuming there are 'overlap' bases in common. Aligns the last 'overlap' bases of seq1 with the first 'overlap' bases of seq2, and if they align well enough, it joins them halfway. Args: seq1 (string): first sequence, goes on the left seq2 (string): second sequence, on the right overlap (int): usually 1000 Returns: Joined sequence ''' # assumes the last "overlap" bases of seq1 and the first "overlap" bases of seq2 overlap i0 = -overlap i1 = overlap if len(seq1) < overlap: i0 = 0 if len(seq2) < overlap: i1 = len(seq2)-1 acc,inds = swalign(seq1[i0:],seq2[:i1]) # put some constraints on the accuracy if acc < 0.70: raise Exception('Insufficient accuracy for overlap') # now put the break point halfway between inds = [x for x in inds if x[0]>0 and x[1]>0] imid = inds[int(len(inds)/2)] i0 += imid[0] i1 = imid[1] return seq1[:i0] + seq2[i1:]
def Mutate(fastafile, bamfile, fast5dir, region=None, params={}, verbose=0, test=False, reps=4): """Run consensus-calling mutations given required info. This function is the main one called by the setuptools entry points for consensus calling, that is, main() -> consensus() -> Mutate(...) -> PSAlign.Mutate(...) Use this function if you want to basically run poreseq consensus from within Python. It takes care of loading events and mutating in different ways, and if the test flag is specified, it starts with a low-accuracy 2D sequence instead of the ref. Args: fastafile (string): reference fasta file (alignment reference) bamfile (string): BAM-formatted file of alignments to reference fast5dir (string): folder where fast5 files are contained region (string): region string, or None for whole reference params (param dict): parameters to use for loading verbose (int): 0 for silent, 1 for steps, 2 for full mutations test (boolean): start with low-accuracy 2D sequence? reps (int): number of iterations for mutation and refinement Returns: tuple(sequence (string), acc (float)): mutated higher-accuracy consensus sequence along with its accuracy relative to the reference """ if 'verbose' not in params: params['verbose'] = 0 pa = LoadAlignedEvents(fastafile,bamfile,fast5dir,RegionInfo(region),params) # and the loaded reference sequence refseq = pa.sequence # test automatically sets verbose if test and verbose == 0: verbose = 1 # we know our algorithm doesn't do great for 1 or 2 events # in which case we can just shortcut and return the starting seq if len(pa.events) < 5: if verbose > 0: sys.stderr.write("Coverage is 1 or 2, not mutating...\n") return (refseq, 100) if verbose > 0: sys.stderr.write("Mutating {} bases using {} events\n".format(len(refseq),len(pa.events))) # if test mode, pick a sequence from event sequences if test: seq = "" for ev in pa.events: pairs = poreseqcpp.swalign(ev.sequence,refseq)[1] if pairs[-1][1]-pairs[0][1] > len(seq): seq = ev.sequence[pairs[0][0]:pairs[-1][0]] pa.sequence = seq if test: sys.stderr.write("Starting accuracy: " + str(round(poreseqcpp.swalign(pa.sequence,refseq)[0],1)) + "%\n") pa.Mutate(reps=reps) if verbose>0: acc = poreseqcpp.swalign(pa.sequence,refseq)[0] sys.stderr.write("Accuracy: " + str(round(acc,1)) + "%\n") for i in range(reps): pa.Mutate(seqs='viterbi') nbases = pa.Refine() if verbose>0: acc = poreseqcpp.swalign(pa.sequence,refseq)[0] sys.stderr.write("Accuracy: " + str(round(acc,1)) + "%\n") if nbases == 0: break # trim ends as requested if 'end_trim' in params and len(pa.sequence) > 2*params['end_trim']: pa.sequence = pa.sequence[int(params['end_trim']):-int(params['end_trim'])] # find the aligned sequence stats acc,inds = poreseqcpp.swalign(pa.sequence,refseq) if verbose>0: errs = np.sum(np.array(inds)==0,0) sys.stderr.write("Final accuracy: " + str(round(acc,1)) + "%\n") sys.stderr.write("Insertions: {}, Deletions: {}\n".format(errs[0],errs[1])) sys.stderr.write("Final coverage: " + str(round(np.mean(pa.Coverage()),1)) + "X\n") return (pa.sequence, acc)
def Mutate(fastafile, bamfile, fast5dir, region=None, params={}, verbose=0, test=False, reps=4): """Run consensus-calling mutations given required info. This function is the main one called by the setuptools entry points for consensus calling, that is, main() -> consensus() -> Mutate(...) -> PSAlign.Mutate(...) Use this function if you want to basically run poreseq consensus from within Python. It takes care of loading events and mutating in different ways, and if the test flag is specified, it starts with a low-accuracy 2D sequence instead of the ref. Args: fastafile (string): reference fasta file (alignment reference) bamfile (string): BAM-formatted file of alignments to reference fast5dir (string): folder where fast5 files are contained region (string): region string, or None for whole reference params (param dict): parameters to use for loading verbose (int): 0 for silent, 1 for steps, 2 for full mutations test (boolean): start with low-accuracy 2D sequence? reps (int): number of iterations for mutation and refinement Returns: tuple(sequence (string), acc (float)): mutated higher-accuracy consensus sequence along with its accuracy relative to the reference """ if 'verbose' not in params: params['verbose'] = 0 pa = LoadAlignedEvents(fastafile, bamfile, fast5dir, RegionInfo(region), params) # and the loaded reference sequence refseq = pa.sequence # test automatically sets verbose if test and verbose == 0: verbose = 1 # we know our algorithm doesn't do great for 1 or 2 events # in which case we can just shortcut and return the starting seq if len(pa.events) < 5: if verbose > 0: sys.stderr.write("Coverage is 1 or 2, not mutating...\n") return (refseq, 100) if verbose > 0: sys.stderr.write("Mutating {} bases using {} events\n".format( len(refseq), len(pa.events))) # if test mode, pick a sequence from event sequences if test: seq = "" for ev in pa.events: pairs = poreseqcpp.swalign(ev.sequence, refseq)[1] if pairs[-1][1] - pairs[0][1] > len(seq): seq = ev.sequence[pairs[0][0]:pairs[-1][0]] pa.sequence = seq if test: sys.stderr.write( "Starting accuracy: " + str(round(poreseqcpp.swalign(pa.sequence, refseq)[0], 1)) + "%\n") pa.Mutate(reps=reps) if verbose > 0: acc = poreseqcpp.swalign(pa.sequence, refseq)[0] sys.stderr.write("Accuracy: " + str(round(acc, 1)) + "%\n") for i in range(reps): pa.Mutate(seqs='viterbi') nbases = pa.Refine() if verbose > 0: acc = poreseqcpp.swalign(pa.sequence, refseq)[0] sys.stderr.write("Accuracy: " + str(round(acc, 1)) + "%\n") if nbases == 0: break # trim ends as requested if 'end_trim' in params and len(pa.sequence) > 2 * params['end_trim']: pa.sequence = pa.sequence[int(params['end_trim'] ):-int(params['end_trim'])] # find the aligned sequence stats acc, inds = poreseqcpp.swalign(pa.sequence, refseq) if verbose > 0: errs = np.sum(np.array(inds) == 0, 0) sys.stderr.write("Final accuracy: " + str(round(acc, 1)) + "%\n") sys.stderr.write("Insertions: {}, Deletions: {}\n".format( errs[0], errs[1])) sys.stderr.write("Final coverage: " + str(round(np.mean(pa.Coverage()), 1)) + "X\n") return (pa.sequence, acc)