def process_self_align_into_seed(align_filename, seqids, reader_class, pCS=None, dun_use_partial=False):
    """
    Seed a preClusterSet from a self-alignment file (debug-tracing variant).

    Ignore hits that are - strand or qID >= sID (self hit or already reported).
    Every remaining hit is classified with ``r.characterize(...)`` and then
    recorded in ``pCS`` as a match, a partial, or a containment ("tuck").

    Args:
        align_filename: path handed to ``reader_class``.
        seqids: iterable of all sequence IDs; used to initialise the orphan set.
        reader_class: callable that takes ``align_filename`` and returns an
            iterable of records exposing ``qID``, ``sID``, ``strand`` and
            ``characterize``.
        pCS: existing preClusterSet to extend; a new ``preClusterSet2`` is
            created when None.
        dun_use_partial: when True, hits classified as 'partial' are skipped.

    Returns: pCS -- preClusterSet
             orphans --- seqs that are neither in pCS nor tucked i.e. no align hits
    """
    if pCS is None:
        pCS = preClusterSet2()

    orphans = set(seqids)
    for r in reader_class(align_filename):
        if r.qID >= r.sID or r.strand == '-':
            continue
        s = r.characterize(30, 0.01, 30, 0.01, 30, 0.05, min_identity=0.99)
        if dun_use_partial and s == 'partial':
            continue

        if s in ('match', 'partial', 'q_contained', 's_contained'):
            # bug_ids is not defined in this function; presumably a module-level
            # collection of sequence IDs to trace -- verify against the rest of the file.
            traced = r.qID in bug_ids or r.sID in bug_ids
            if s == 'match':
                pCS.add_seqid_match(r.qID, r.sID)
                if traced:
                    print("match:", r.qID, r.sID)
                    print("after match:", pCS.seq_stat[r.qID], pCS.seq_stat[r.sID])
            elif s == 'partial':
                pCS.add_seqid_partial(r.qID, r.sID)
                if traced:
                    print("partial:", r.qID, r.sID)
                    print("after partial:", pCS.seq_stat[r.qID], pCS.seq_stat[r.sID])
            else:
                # 'q_contained' tucks the query into the subject,
                # 's_contained' tucks the subject into the query.
                if s == 'q_contained':
                    inner, outer = r.qID, r.sID
                else:
                    inner, outer = r.sID, r.qID
                if traced:
                    print("tucking {0} into {1}".format(inner, outer))
                    print("before:", pCS.seq_stat[inner], pCS.seq_stat[outer])
                pCS.add_seqid_contained(inner, outer)
                if traced:
                    print("after:", pCS.seq_stat[inner], pCS.seq_stat[outer])

        # NOTE(review): both IDs leave the orphan set for every hit that survives
        # the filters above, whatever its category -- confirm intended nesting.
        # discard() is a no-op when the ID is absent (replaces bare except/pass).
        orphans.discard(r.qID)
        orphans.discard(r.sID)
        #sanity_checking(pCS, orphans)

    #sanity_checking(pCS, orphans)
    return pCS, orphans
def process_self_align_into_seed(align_filename, seqids, reader_class, pCS=None, dun_use_partial=False):
    """
    Seed a preClusterSet from a self-alignment file.

    Ignore hits that are - strand or qID >= sID (self hit or already reported).
    Every remaining hit classified as 'match', 'partial' or '*_contained' is
    recorded in ``pCS`` as a plain match.

    Args:
        align_filename: path handed to ``reader_class``.
        seqids: iterable of all sequence IDs; used to initialise the orphan set.
        reader_class: callable that takes ``align_filename`` and returns an
            iterable of records exposing ``qID``, ``sID``, ``strand`` and
            ``characterize``.
        pCS: existing preClusterSet to extend; a new ``preClusterSet2`` is
            created when None.
        dun_use_partial: when True, hits classified as 'partial' are skipped.

    Returns: pCS -- preClusterSet
             orphans --- seqs that are neither in pCS nor tucked i.e. no align hits
    """
    if pCS is None:
        pCS = preClusterSet2()

    orphans = set(seqids)
    for r in reader_class(align_filename):
        if r.qID >= r.sID or r.strand == '-':
            continue
        s = r.characterize(100, 0.05, 50, 0.02, 20, 0.01)
        if dun_use_partial and s == 'partial':
            continue

        # Liz note: currently, just add all to match because minimap sensitivity
        # not enough to do "tuck" properly (so add_seqid_contained is not used here).
        if s in ('match', 'partial') or s.endswith('_contained'):
            pCS.add_seqid_match(r.qID, r.sID)

        # NOTE(review): both IDs leave the orphan set for every hit that survives
        # the filters above, whatever its category -- confirm intended nesting.
        # discard() is a no-op when the ID is absent (replaces bare except/pass).
        orphans.discard(r.qID)
        orphans.discard(r.sID)
        #sanity_checking(pCS, orphans)

    #sanity_checking(pCS, orphans)
    return pCS, orphans
def read_seq_csv(csv_filename):
    """
    Rebuild a preClusterSet and the orphan set from a CSV file.

    The CSV must have the columns 'seqid' and 'stat'. A row whose 'stat' is
    the string 'orphan' adds its 'seqid' to the orphan set; any other 'stat'
    is parsed with int() as a cluster ID, the cluster is created on first
    sight, and the 'seqid' is added to it.

    Args:
        csv_filename: path to the comma-delimited file to read.

    Returns: pCS -- preClusterSet2 populated from the file
             orphans -- set of seqids marked 'orphan'

    Exits the process with status -1 (after printing to stderr) when the
    first data row shows that 'seqid' or 'stat' is missing.
    """
    orphans = set()
    pCS = preClusterSet2()

    # Context manager so the file handle is closed even on early exit.
    with open(csv_filename) as handle:
        # sanity check that "seqid" and "stat" are two valid column headers;
        # done on the first data row only, so a file without rows is not checked.
        header_checked = False
        for r in DictReader(handle, delimiter=','):
            if not header_checked:
                if 'seqid' not in r or 'stat' not in r:
                    print("{0} must have the fields 'seqid' and 'stat'! Abort".format(csv_filename), file=sys.stderr)
                    sys.exit(-1)
                header_checked = True

            if r['stat'] == 'orphan':
                orphans.add(r['seqid'])
            else:
                cid = int(r['stat'])
                if cid not in pCS.S:
                    pCS.S[cid] = preCluster(cid=cid)
                pCS.add_seqid_to_cluster_by_cid(r['seqid'], cid)

    return pCS, orphans
def process_self_align_into_seed(align_filename, seqids, reader_class, pCS=None):
    """
    Seed a preClusterSet from a self-alignment file.

    Ignore hits that are - strand or qID >= sID (self hit or already reported).
    Hits classified as 'partial' are always skipped. A 'match' is recorded via
    ``add_seqid_match``; 'q_contained' tucks the query into the subject and
    's_contained' tucks the subject into the query via ``add_seqid_contained``.

    Args:
        align_filename: path handed to ``reader_class``.
        seqids: iterable of all sequence IDs; used to initialise the orphan set.
        reader_class: callable that takes ``align_filename`` and returns an
            iterable of records exposing ``qID``, ``sID``, ``strand`` and
            ``characterize``.
        pCS: existing preClusterSet to extend; a new ``preClusterSet2`` is
            created when None.

    Returns: pCS -- preClusterSet
             orphans --- seqs that are neither in pCS nor tucked i.e. no align hits
    """
    if pCS is None:
        pCS = preClusterSet2()

    orphans = set(seqids)
    for r in reader_class(align_filename):
        if r.qID >= r.sID or r.strand == '-':
            continue
        s = r.characterize(400, 0.1, 400, 0.1, 100, 0.1)
        if s == 'partial':
            continue

        if s == 'match':
            pCS.add_seqid_match(r.qID, r.sID)
        elif s == 'q_contained':
            pCS.add_seqid_contained(r.qID, r.sID)
        elif s == 's_contained':
            pCS.add_seqid_contained(r.sID, r.qID)

        # NOTE(review): both IDs leave the orphan set for every hit that survives
        # the filters above, whatever its category -- confirm intended nesting.
        # discard() is a no-op when the ID is absent (replaces bare except/pass).
        orphans.discard(r.qID)
        orphans.discard(r.sID)
        #sanity_checking(pCS, orphans)

    #sanity_checking(pCS, orphans)
    return pCS, orphans
def process_self_align_into_seed(align_filename, seqids, reader_class, pCS=None, dun_use_partial=False):
    """
    Seed a preClusterSet from a self-alignment file.

    Ignore hits that are - strand or qID >= sID (self hit or already reported).
    Every remaining hit classified as 'match', 'partial' or '*_contained' is
    recorded in ``pCS`` as a plain match.

    Args:
        align_filename: path handed to ``reader_class``.
        seqids: iterable of all sequence IDs; used to initialise the orphan set.
        reader_class: callable that takes ``align_filename`` and returns an
            iterable of records exposing ``qID``, ``sID``, ``strand`` and
            ``characterize``.
        pCS: existing preClusterSet to extend; a new ``preClusterSet2`` is
            created when None.
        dun_use_partial: when True, hits classified as 'partial' are skipped.

    Returns: pCS -- preClusterSet
             orphans --- seqs that are neither in pCS nor tucked i.e. no align hits
    """
    if pCS is None:
        pCS = preClusterSet2()

    orphans = set(seqids)
    for r in reader_class(align_filename):
        if r.qID >= r.sID or r.strand == '-':
            continue
        s = r.characterize(400, 0.4, 100, 0.1, 50, 0.05)
        if dun_use_partial and s == 'partial':
            continue

        # Liz note: currently, just add all to match because minimap sensitivity
        # not enough to do "tuck" properly (so add_seqid_contained is not used here).
        if s in ('match', 'partial') or s.endswith('_contained'):
            pCS.add_seqid_match(r.qID, r.sID)

        # NOTE(review): both IDs leave the orphan set for every hit that survives
        # the filters above, whatever its category -- confirm intended nesting.
        # discard() is a no-op when the ID is absent (replaces bare except/pass).
        orphans.discard(r.qID)
        orphans.discard(r.sID)
        #sanity_checking(pCS, orphans)

    #sanity_checking(pCS, orphans)
    return pCS, orphans