def merge_allele_frequencies(data_folder, adaID, fragments, VERBOSE=0):
    '''Merge allele frequencies at overlapping pairs

    Parameters:
       data_folder, adaID: forwarded unchanged to the filename and overlap
          helpers.
       fragments: iterable of fragment names; they are processed in sorted
          order.
       VERBOSE: verbosity level forwarded to the overlap helpers.

    Returns:
       a list of two-item lists [fragment_names, frequencies], one for each
       run of fragments that are adjacent in sorted order and listed as an
       overlapping pair. The frequencies of a run are joined along axis 1.

    A RuntimeWarning is issued for every overlapping pair whose consensi are
    reported as different by check_overlap_consensus.
    '''
    import warnings
    import numpy as np

    # Allele frequency matrix of each fragment, as stored on disk
    nus = {frag: np.load(get_allele_frequencies_filename(data_folder, adaID, frag))
           for frag in fragments}

    # Collect the overlap of every overlapping pair, warning on disagreement
    overlaps = {}
    for (frag1, frag2) in get_overlapping_fragments(fragments):
        overlap = get_overlap(data_folder, adaID, frag1, frag2, VERBOSE=VERBOSE)
        is_diff = check_overlap_consensus(data_folder, adaID, frag1, frag2,
                                          overlap, VERBOSE=VERBOSE)
        if is_diff:
            warnings.warn(frag1+' and '+frag2+' have different consensi.',
                          RuntimeWarning)
        overlaps[(frag1, frag2)] = overlap

    nu = []
    fragments = sorted(fragments)
    for i, frag in enumerate(fragments):
        # If the start is not an overlap, start a new chunk and copy all
        if (i == 0) or (fragments[i-1], frag) not in overlaps:
            nu.append([[frag], nus[frag]])
            continue

        # else, extend the current chunk
        # FIXME: we could average the consensus zone out of indels...
        nuf = nu[-1]
        nuf[0].append(frag)
        overlap = overlaps[(fragments[i-1], frag)]
        if overlap is not None:
            # Copy from the end of the overlap on: the second item of the
            # overlap is used as the first column to take from this fragment
            (_, start, _) = overlap
            # (repeated concatenation is not the most efficient, but simple)
            nuf[1] = np.concatenate([nuf[1], nus[frag][:, start:]], axis=1)
        else:
            # No overlap information: insert a 10-column filler whose last
            # row is set to 1, then the WHOLE fragment. There is no start
            # coordinate in this branch, so the fragment must not be sliced.
            # NOTE(review): the last row is presumably the 'N' symbol, in
            # analogy with the 'N' * 10 spacer of merge_consensi -- confirm.
            filler = np.zeros((nuf[1].shape[0], 10), float)
            filler[-1] = 1
            nuf[1] = np.concatenate([nuf[1], filler, nus[frag]], axis=1)

    return nu
def merge_consensi(data_folder, adaID, fragments, VERBOSE=0):
    '''Merge consensi at overlapping pairs

    Parameters:
       data_folder, adaID: forwarded unchanged to the filename and overlap
          helpers; adaID is also used to build the record names.
       fragments: iterable of fragment names; they are processed in sorted
          order.
       VERBOSE: verbosity level forwarded to the overlap helpers.

    Returns:
       a list of (fragment_names, SeqRecord) tuples, one for each run of
       fragments that are adjacent in sorted order and listed as an
       overlapping pair. Each record is named
       'adaID_<adaID>_<frag1>-<frag2>-...'.

    A RuntimeWarning is issued for every overlapping pair whose consensi are
    reported as different by check_overlap_consensus.
    '''
    import warnings

    # Read the primer-trimmed consensus of every fragment
    consensi = {}
    for frag in fragments:
        filename = get_consensus_filename(data_folder, adaID, frag,
                                          trim_primers=True)
        consensi[frag] = SeqIO.read(filename, 'fasta')

    # Collect the overlap of every overlapping pair, warning on disagreement
    overlaps = {}
    for (frag1, frag2) in get_overlapping_fragments(fragments):
        overlap = get_overlap(data_folder, adaID, frag1, frag2,
                              VERBOSE=VERBOSE)
        if check_overlap_consensus(data_folder, adaID, frag1, frag2, overlap,
                                   VERBOSE=VERBOSE):
            warnings.warn(frag1+' and '+frag2+' have different consensi.',
                          RuntimeWarning)
        overlaps[(frag1, frag2)] = overlap

    # Stitch the sequences together, one [names, string] chunk per run
    chunks = []
    ordered = sorted(fragments)
    for idx, frag in enumerate(ordered):
        pair = (ordered[idx - 1], frag)

        # Not preceded by an overlapping fragment: open a new chunk
        if idx == 0 or pair not in overlaps:
            chunks.append([[frag], str(consensi[frag].seq)])
            continue

        chunk = chunks[-1]
        chunk[0].append(frag)
        overlap = overlaps[pair]
        if overlap is None:
            # No overlap information: 10 N as a spacer, then the whole fragment
            chunk[1] = chunk[1] + ('N' * 10) + str(consensi[frag].seq)
        else:
            # Copy from the end of the overlap on
            (_, start, _) = overlap
            chunk[1] = chunk[1] + str(consensi[frag][start:].seq)

    # Make SeqRecords out of the merged strings
    merged = []
    for (frags, sequence) in chunks:
        name = 'adaID_'+str(adaID)+'_'+'-'.join(frags)
        record = SeqRecord(Seq(sequence, IUPAC.ambiguous_dna),
                           id=name, name=name)
        merged.append((frags, record))

    return merged