def _merge_scaffolds(big_scaffolds, small_scaffolds):
    """
    Performs the final merging step.

    For every pair of neighbouring non-repetitive contigs of a big
    scaffold, the contigs located between the same two contigs in a
    small scaffold are inserted, provided that both contigs belong to
    the same small scaffold, their relative orientation agrees and none
    of the inserted contigs is already uniquely placed in the big
    scaffolds. Returns the new list of scaffolds.
    """
    # Statistics, only reported through the debug log
    num_not_found = 0
    num_diff_scaffold = 0
    num_diff_orient = 0
    num_inconsistent = 0
    num_success = 0
    num_fail = 0
    num_inserted = 0

    # Number of occurrences of every permutation in each scaffold set
    big_occurrences = defaultdict(int)
    for scaffold in big_scaffolds:
        for contig in scaffold.contigs:
            big_occurrences[contig.perm] += 1
    small_occurrences = defaultdict(int)
    for scaffold in small_scaffolds:
        for contig in scaffold.contigs:
            small_occurrences[contig.perm] += 1

    # A permutation is a repeat if it occurs more than once in either set
    repeats = {perm for perm, num in big_occurrences.items() if num > 1}
    repeats.update(perm for perm, num in small_occurrences.items()
                   if num > 1)
    big_unique = {perm for perm, num in big_occurrences.items() if num == 1}

    # Position of every non-repetitive contig inside the small scaffolds
    anchor_pos = {}
    for scaffold in small_scaffolds:
        for idx, contig in enumerate(scaffold.contigs):
            if contig.perm in repeats:
                continue
            assert contig.perm not in anchor_pos
            anchor_pos[contig.perm] = (scaffold, idx)

    merged = []
    for big_scf in big_scaffolds:
        big_contigs = big_scf.contigs
        anchors = [idx for idx, contig in enumerate(big_contigs)
                   if contig.perm not in repeats]

        # NOTE(review): contigs located before the first / after the last
        # non-repetitive contig are not carried over into the rebuilt
        # scaffold -- confirm this is intended
        rebuilt = []
        for left_idx, right_idx in zip(anchors, anchors[1:]):
            left_cnt = big_contigs[left_idx]
            right_cnt = big_contigs[right_idx]

            accepted = False
            insertion = None
            new_link = None

            if (left_cnt.perm not in anchor_pos
                    or right_cnt.perm not in anchor_pos):
                num_not_found += 1
            else:
                left_scf, left_pos = anchor_pos[left_cnt.perm]
                right_scf, right_pos = anchor_pos[right_cnt.perm]

                big_same_sign = left_cnt.sign == right_cnt.sign
                small_same_sign = (left_scf.contigs[left_pos].sign ==
                                   right_scf.contigs[right_pos].sign)

                if left_scf != right_scf:
                    num_diff_scaffold += 1
                elif big_same_sign != small_same_sign:
                    num_diff_orient += 1
                else:
                    forward = left_pos < right_pos
                    if forward:
                        low, high = left_pos, right_pos
                    else:
                        low, high = right_pos, left_pos

                    accepted = True
                    insertion = left_scf.contigs[low + 1:high]
                    if any(c.perm in big_unique for c in insertion):
                        num_inconsistent += 1
                        accepted = False

                    new_link = copy(left_scf.contigs[low].link)
                    if insertion and not forward:
                        # Reverse complement: the run is traversed
                        # backwards, so every link is taken from the
                        # contig that preceded it in the small scaffold
                        new_link = copy(left_scf.contigs[high - 1].link)
                        insertion = [c.reverse_copy()
                                     for c in reversed(insertion)]
                        for cur, nxt in zip(insertion, insertion[1:]):
                            cur.link = copy(nxt.link)
                        insertion[-1].link = copy(
                            left_scf.contigs[low].link)

            rebuilt.append(left_cnt)
            if accepted and insertion:
                left_cnt.link = new_link
                rebuilt.extend(insertion)
                num_success += 1
                num_inserted += len(insertion)
            else:
                # Keep whatever the big scaffold had between the anchors
                rebuilt.extend(big_contigs[left_idx + 1:right_idx])
                num_fail += 1

        if len(rebuilt) > 1:
            rebuilt.append(right_cnt)
            scaffold = Scaffold(big_scf.name)
            scaffold.contigs = rebuilt
            merged.append(scaffold)
        else:
            # Because of repeats: nothing to rebuild, keep the original
            merged.append(big_scf)

    logger.debug("Fail: not found: %d", num_not_found)
    logger.debug("Fail: different scaffolds: %d", num_diff_scaffold)
    logger.debug("Fail: different orientatilns: %d", num_diff_orient)
    logger.debug("Fail: inconsistent: %d", num_inconsistent)
    logger.debug("Total success: %d", num_success)
    logger.debug("Total fail: %d", num_fail)
    logger.debug("Total inserted: %d", num_inserted)

    num_contigs = sum(len(scaffold.contigs) for scaffold in merged)
    logger.debug("Result: %d contigs in %d scaffolds",
                 num_contigs, len(merged))
    return merged
def _merge_scaffolds(big_scaffolds, small_scaffolds):
    """
    Performs the final merging step.

    Walks over neighbouring non-repetitive contigs of every big scaffold
    and, when the same two contigs are found in one small scaffold with
    a matching relative orientation, inserts the small-scaffold contigs
    lying between them. An insertion is rejected if it contains a contig
    that occurs exactly once in the big scaffolds. Returns the new list
    of scaffolds.
    """
    # Imported locally so the block does not rely on a module-level import
    from copy import copy

    count_diff_scaf = 0
    count_diff_orient = 0
    count_inconsistent = 0
    total_success = 0
    total_fail = 0
    total_inserted = 0
    not_found = 0

    big_count = defaultdict(int)
    for scf in big_scaffolds:
        for c in scf.contigs:
            big_count[c.perm] += 1
    small_count = defaultdict(int)
    for scf in small_scaffolds:
        for c in scf.contigs:
            small_count[c.perm] += 1

    # Repeats occur more than once in either of the scaffold sets
    repeats = set(seq for (seq, count) in
                  chain(big_count.items(), small_count.items())
                  if count > 1)
    big_unique = set(seq for (seq, count) in big_count.items()
                     if count == 1)

    small_index = {}
    for scf in small_scaffolds:
        for pos, contig in enumerate(scf.contigs):
            if contig.perm not in repeats:
                assert contig.perm not in small_index
                small_index[contig.perm] = (scf, pos)

    new_scafflods = []
    for big_scf in big_scaffolds:
        new_contigs = []
        # A comprehension instead of xrange(), which is not available
        # on Python 3
        non_repeats = [i for i, cnt in enumerate(big_scf.contigs)
                       if cnt.perm not in repeats]

        # NOTE(review): contigs located before the first / after the last
        # non-repetitive contig are not carried over into the rebuilt
        # scaffold -- confirm this is intended
        for left_idx, right_idx in zip(non_repeats[:-1], non_repeats[1:]):
            left_cnt = big_scf.contigs[left_idx]
            right_cnt = big_scf.contigs[right_idx]

            consistent = False
            weak_contigs = None
            link_to_change = None
            if (left_cnt.perm in small_index and
                    right_cnt.perm in small_index):
                consistent = True
                left_scf, left_pos = small_index[left_cnt.perm]
                right_scf, right_pos = small_index[right_cnt.perm]

                big_sign = left_cnt.sign == right_cnt.sign
                small_sign = (left_scf.contigs[left_pos].sign ==
                              right_scf.contigs[right_pos].sign)

                if left_scf != right_scf:
                    count_diff_scaf += 1
                    consistent = False
                elif big_sign != small_sign:
                    count_diff_orient += 1
                    consistent = False
                else:
                    same_dir = left_pos < right_pos
                    if not same_dir:
                        left_pos, right_pos = right_pos, left_pos

                    weak_contigs = left_scf.contigs[left_pos + 1:right_pos]
                    if any(c.perm in big_unique for c in weak_contigs):
                        count_inconsistent += 1
                        consistent = False

                    if consistent and weak_contigs:
                        if same_dir:
                            # Copy, so that the output does not share
                            # link objects with the small scaffold
                            link_to_change = copy(
                                left_scf.contigs[left_pos].link)
                        else:
                            # Reverse complement: the run is traversed
                            # backwards, so each link must come from the
                            # contig that preceded it in the small scaffold
                            link_to_change = copy(
                                left_scf.contigs[right_pos - 1].link)
                            weak_contigs = [c.reverse_copy()
                                            for c in weak_contigs[::-1]]
                            for pw, nw in zip(weak_contigs[:-1],
                                              weak_contigs[1:]):
                                pw.link = copy(nw.link)
                            weak_contigs[-1].link = copy(
                                left_scf.contigs[left_pos].link)
            else:
                not_found += 1

            new_contigs.append(left_cnt)
            # An empty insertion must not replace the contigs (repeats)
            # that the big scaffold has between the two anchors
            if consistent and weak_contigs:
                new_contigs[-1].link = link_to_change
                new_contigs.extend(weak_contigs)
                total_success += 1
                total_inserted += len(weak_contigs)
            else:
                new_contigs.extend(big_scf.contigs[left_idx + 1:right_idx])
                total_fail += 1

        if len(new_contigs) > 1:
            new_contigs.append(right_cnt)
            s = Scaffold(big_scf.name)
            s.contigs = new_contigs
            new_scafflods.append(s)
        else:   #because of repeats
            new_scafflods.append(big_scf)

    logger.debug("Fail: not found: %d", not_found)
    logger.debug("Fail: different scaffolds: %d", count_diff_scaf)
    logger.debug("Fail: different orientatilns: %d", count_diff_orient)
    logger.debug("Fail: inconsistent: %d", count_inconsistent)
    logger.debug("Total success: %d", total_success)
    logger.debug("Total fail: %d", total_fail)
    logger.debug("Total inserted: %d", total_inserted)

    num_contigs = sum(len(scf.contigs) for scf in new_scafflods)
    logger.debug("Result: %d contigs in %d scaffolds",
                 num_contigs, len(new_scafflods))
    return new_scafflods
def merge(big_scaffolds, small_scaffolds):
    """
    Merges scaffolds of two iterations.

    For every pair of adjacent contigs of a big scaffold that are both
    present in the same small scaffold, the small-scaffold contigs lying
    between them are inserted, unless one of them already belongs to a
    big scaffold or the relative orientation of the pair differs.
    Returns a new list of scaffolds named after the big ones.
    """
    logger.info("Merging two iterations")

    # Names of all contigs that are already placed in big scaffolds
    big_index = set(c.name for scf in big_scaffolds for c in scf.contigs)

    # Contig name -> (small scaffold, position inside it)
    small_index = {}
    for scf in small_scaffolds:
        for pos, contig in enumerate(scf.contigs):
            assert contig.name not in small_index
            small_index[contig.name] = (scf, pos)

    new_scafflods = []
    for scf in big_scaffolds:
        result = []
        for prev_cont, new_cont in zip(scf.contigs[:-1], scf.contigs[1:]):
            result.append(prev_cont)

            try:
                scf_prev, begin = small_index[prev_cont.name]
                scf_new, end = small_index[new_cont.name]
            except KeyError:
                continue
            if scf_prev.name != scf_new.name:
                continue

            assert end != begin
            same_dir = begin < end
            if not same_dir:
                begin, end = end, begin

            # Nothing to insert, or a candidate is already in a big scaffold
            contigs = scf_prev.contigs[begin + 1:end]
            if end - begin == 1 or any(c.name in big_index
                                       for c in contigs):
                continue

            # Relative orientation of the pair must agree in both sets
            if ((prev_cont.sign == new_cont.sign) !=
                    (scf_prev.contigs[begin].sign ==
                     scf_prev.contigs[end].sign)):
                continue

            if not same_dir:
                contigs = [c.reverse() for c in contigs[::-1]]

            #keeping gap from new contigs
            # NOTE(review): for a reversed insertion the link is still taken
            # from the contig at `begin` -- confirm this is intended
            result[-1].link = scf_prev.contigs[begin].link
            result.extend(contigs)

        # The pair loop only appends the left element of each pair, so the
        # last contig is added here. Taking it from the scaffold itself
        # (not from the leaked loop variable) keeps scaffolds with fewer
        # than two contigs correct.
        if scf.contigs:
            result.append(scf.contigs[-1])

        s = Scaffold(scf.name)
        s.contigs = result
        new_scafflods.append(s)

    return new_scafflods
def merge(big_scaffolds, small_scaffolds):
    """
    Merges scaffolds of two iterations.

    For every pair of adjacent contigs of a big scaffold that are both
    present in the same small scaffold, the small-scaffold contigs lying
    between them are inserted, unless one of them already belongs to a
    big scaffold or the relative orientation of the pair differs.
    Returns a new list of scaffolds named after the big ones.
    """
    logger.info("Merging two iterations")

    # Names of all contigs that are already placed in big scaffolds
    big_index = set(c.name for scf in big_scaffolds for c in scf.contigs)

    # Contig name -> (small scaffold, position inside it)
    small_index = {}
    for scf in small_scaffolds:
        for pos, contig in enumerate(scf.contigs):
            assert contig.name not in small_index
            small_index[contig.name] = (scf, pos)

    new_scafflods = []
    for scf in big_scaffolds:
        result = []
        for prev_cont, new_cont in zip(scf.contigs[:-1], scf.contigs[1:]):
            result.append(prev_cont)

            try:
                scf_prev, begin = small_index[prev_cont.name]
                scf_new, end = small_index[new_cont.name]
            except KeyError:
                continue
            if scf_prev.name != scf_new.name:
                continue

            assert end != begin
            same_dir = begin < end
            if not same_dir:
                begin, end = end, begin

            # Nothing to insert, or a candidate is already in a big scaffold
            contigs = scf_prev.contigs[begin + 1:end]
            if end - begin == 1 or any(c.name in big_index
                                       for c in contigs):
                continue

            # Relative orientation of the pair must agree in both sets
            if ((prev_cont.sign == new_cont.sign) !=
                    (scf_prev.contigs[begin].sign ==
                     scf_prev.contigs[end].sign)):
                continue

            if not same_dir:
                # Reversed run: new contigs with the opposite sign and the
                # third constructor argument set to 0
                contigs = [Contig(c.name, -c.sign, 0)
                           for c in contigs[::-1]]
            result.extend(contigs)

        # The pair loop only appends the left element of each pair, so the
        # last contig is added here. Taking it from the scaffold itself
        # (not from the leaked loop variable) keeps scaffolds with fewer
        # than two contigs correct.
        if scf.contigs:
            result.append(scf.contigs[-1])

        s = Scaffold(scf.name)
        s.contigs = result
        new_scafflods.append(s)

    return new_scafflods