def main_gradient_2_sans_proj(u_p, v_p, Y, u_, v_, N, M, lambda_, beta_u,
                              beta_v, lambda_1, lambda_2, dt):
    """Run the noisy "gradient 2" iteration without projection.

    Performs ``iteration`` (a module-level name) update steps on the pair
    ``(u_p, v_p)``.  Each step subtracts the scaled gradient returned by
    ``f.gradient_u_2`` / ``f.gradient_v_2``, adds zero-mean Gaussian noise
    whose standard deviation is ``sqrt(dt)``, and subtracts a term
    proportional to the current iterate.  Both updates of one step read the
    values produced by the previous step.

    Args:
        u_p: Starting value of the first iterate (noise of length ``N`` is
            added to it).
        v_p: Starting value of the second iterate (noise of length ``M`` is
            added to it).
        Y: Forwarded unchanged to the gradient helpers.
        u_: Reference compared with the final ``u`` through ``f.overlap``.
        v_: Reference compared with the final ``v`` through ``f.overlap``.
        N: Length of the noise vector drawn for ``u``.
        M: Length of the noise vector drawn for ``v``.
        lambda_: Forwarded unchanged to the gradient helpers.
        beta_u, beta_v, lambda_1, lambda_2: Scaling constants.  Their
            combinations are passed to ``torch.sqrt`` -- presumably tensors;
            confirm with the caller.
        dt: Step size; passed to ``torch.sqrt``.

    Returns:
        Tuple ``(res_u, res_v)`` with the values returned by ``f.overlap``
        for the final iterates.  With zero iterations the starting values
        are evaluated instead of raising ``NameError``.
    """
    # These quantities do not change between steps, so compute them once.
    sqrt_dt = torch.sqrt(dt)
    u_noise_scale = torch.sqrt(2 / (lambda_1 * beta_u))
    v_noise_scale = torch.sqrt(2 / (lambda_2 * beta_v))
    u_shrink = (N - 1) / (N * lambda_1 * beta_u)
    v_shrink = (M - 1) / (M * lambda_2 * beta_v)

    for _ in range(iteration):
        # Update of u; the random draw for u happens before the one for v.
        u_drift = (1 / lambda_1) * f.gradient_u_2(N, M, u_p, v_p, Y,
                                                  lambda_) * dt
        u_noise = u_noise_scale * torch.empty(N).normal_(mean=0, std=sqrt_dt)
        u_next = u_p - u_drift + u_noise - u_shrink * u_p * dt

        # Update of v; it still reads the previous u_p, not u_next.
        v_drift = 1 / lambda_2 * f.gradient_v_2(N, M, u_p, v_p, Y,
                                                lambda_) * dt
        v_noise = v_noise_scale * torch.empty(M).normal_(mean=0, std=sqrt_dt)
        v_next = v_p - v_drift + v_noise - v_shrink * v_p * dt

        # Re-assign for the next step
        u_p, v_p = u_next, v_next

    res_u = f.overlap(u_, u_p, N)
    res_v = f.overlap(v_, v_p, M)
    print("g2_u: ", res_u)
    print("g2_v: ", res_v)
    return (res_u, res_v)
def main_gradient_1_avec_proj(u_p, v_p, Y, u_, v_, N, M, lambda_, beta_u,
                              beta_v, lambda_1, lambda_2, dt):
    """Run the noisy "gradient 1" iteration with projection.

    Performs ``iteration`` (a module-level name) update steps on the pair
    ``(u_p, v_p)``.  Each step contracts the result of ``f.proj`` with the
    gradient from ``f.gradient_u_1`` / ``f.gradient_v_1`` and with a
    zero-mean Gaussian draw of standard deviation ``sqrt(dt)``
    (``torch.tensordot`` over one dimension), then subtracts a term
    proportional to the current iterate.  Both updates of one step read the
    values produced by the previous step.

    Args:
        u_p: Starting value of the first iterate.
        v_p: Starting value of the second iterate.
        Y: Forwarded unchanged to the gradient helpers.
        u_: Reference compared with the final ``u`` through ``f.overlap``.
        v_: Reference compared with the final ``v`` through ``f.overlap``.
        N: Length of the noise vector drawn for ``u``; also given to
            ``f.proj``.
        M: Length of the noise vector drawn for ``v``; also given to
            ``f.proj``.
        lambda_: Forwarded unchanged to the gradient helpers.
        beta_u, beta_v, lambda_1, lambda_2: Scaling constants.  Their
            combinations are passed to ``torch.sqrt`` -- presumably tensors;
            confirm with the caller.
        dt: Step size; passed to ``torch.sqrt``.

    Returns:
        Tuple ``(res_u, res_v)`` with the values returned by ``f.overlap``
        for the final iterates.  With zero iterations the starting values
        are evaluated instead of raising ``NameError``.
    """
    # These quantities do not change between steps, so compute them once.
    sqrt_dt = torch.sqrt(dt)
    u_noise_scale = torch.sqrt(2 / (lambda_1 * beta_u))
    v_noise_scale = torch.sqrt(2 / (lambda_2 * beta_v))
    u_shrink = (N - 1) / (N * lambda_1 * beta_u)
    v_shrink = (M - 1) / (M * lambda_2 * beta_v)

    for _ in range(iteration):
        # Update of u; the random draw for u happens before the one for v.
        u_drift = (1 / lambda_1) * torch.tensordot(
            f.proj(u_p, N),
            f.gradient_u_1(N, M, u_p, v_p, Y, lambda_), 1) * dt
        u_noise = u_noise_scale * torch.tensordot(
            f.proj(u_p, N),
            torch.empty(N).normal_(mean=0, std=sqrt_dt), 1)
        u_next = u_p - u_drift + u_noise - u_shrink * u_p * dt

        # Update of v; it still reads the previous u_p, not u_next.
        v_drift = (1 / lambda_2) * torch.tensordot(
            f.proj(v_p, M),
            f.gradient_v_1(N, M, u_p, v_p, Y, lambda_), 1) * dt
        v_noise = v_noise_scale * torch.tensordot(
            f.proj(v_p, M),
            torch.empty(M).normal_(mean=0, std=sqrt_dt), 1)
        v_next = v_p - v_drift + v_noise - v_shrink * v_p * dt

        # Re-assign for the next step
        u_p, v_p = u_next, v_next

    res_u = f.overlap(u_, u_p, N)
    res_v = f.overlap(v_, v_p, M)
    print("g1_u_proj: ", res_u)
    print("g1_v_proj: ", res_v)
    return (res_u, res_v)
def cmp_run_overlaps(PRa, ka, PRb, kb):
    """Compare the hit clusters of two runs by how much they overlap.

    The clusters of each run are looked up in ``hit_clusters`` under the
    key built from the ``id_a``, ``id_b``, ``linkage_type``, ``alpha``,
    ``cut`` and ``nd`` entries of ``ka`` / ``kb``.  Every cluster ``c``
    contributes ``(c[2] - c[1]) + (c[5] - c[4])`` to the coverage of its
    run.  Clusters are grouped by ``(str(c[0]), str(c[3]))``; a cluster of
    run B is compared with every run-A cluster of the same group and the
    pair counts only when both intervals overlap (``util.overlap`` > 0).

    Args:
        PRa: Object exposing a ``hit_clusters`` mapping for run A.
        ka: Mapping with the key fields selecting the run-A clusters.
        PRb: Object exposing a ``hit_clusters`` mapping for run B.
        kb: Mapping with the key fields selecting the run-B clusters.

    Returns:
        Tuple of three floats: combined overlap over combined coverage,
        first-interval overlap over run-A coverage, and second-interval
        overlap over run-B coverage.

    Raises:
        ZeroDivisionError: If a coverage total used as divisor is zero.
    """
    key_fields = ('id_a', 'id_b', 'linkage_type', 'alpha', 'cut', 'nd')
    clusts_a = PRa.hit_clusters[tuple(ka[field] for field in key_fields)]
    clusts_b = PRb.hit_clusters[tuple(kb[field] for field in key_fields)]

    chr_groups_a = {}
    total_coverage_a = 0
    total_coverage_b = 0
    overlap_a = 0
    overlap_b = 0

    # Index the run-A clusters by their pair of group labels.
    for c in clusts_a:
        k = (str(c[0]), str(c[3]))
        chr_groups_a.setdefault(k, []).append(c)
        total_coverage_a = total_coverage_a + (c[2] - c[1]) + (c[5] - c[4])

    for c in clusts_b:
        k = (str(c[0]), str(c[3]))
        total_coverage_b = total_coverage_b + (c[2] - c[1]) + (c[5] - c[4])
        if k not in chr_groups_a:
            continue

        for rc in chr_groups_a[k]:
            ov_a = util.overlap((c[1], c[2]), (rc[1], rc[2]))
            ov_b = util.overlap((c[4], c[5]), (rc[4], rc[5]))
            if ov_a > 0 and ov_b > 0:
                # print() calls keep the output of the old print statements
                # while remaining valid on Python 3.
                print(k)
                print((c[1], c[2]), (rc[1], rc[2]), "->", ov_a)
                print((c[4], c[5]), (rc[4], rc[5]), "->", ov_b)
                overlap_a = overlap_a + ov_a
                overlap_b = overlap_b + ov_b

    print(overlap_a, overlap_b, total_coverage_a, total_coverage_b)
    return (
        float(overlap_a + overlap_b)
        / float(total_coverage_a + total_coverage_b),
        float(overlap_a) / float(total_coverage_a),
        float(overlap_b) / float(total_coverage_b),
    )
def newAnnotation(request):
    """Add an annotation for every occurrence of the selected text.

    Reads ``document_id``, ``newNec`` (the selected text) and
    ``newNecCategoryId`` from ``request.POST``.  For each index pair
    returned by ``util.findIndices`` that the requesting annotator has not
    annotated yet, the annotator's existing annotations overlapping that
    span are deleted and a new annotation of the chosen type is saved.

    Args:
        request: The incoming request; ``request.POST`` and
            ``request.user.id`` are read.

    Returns:
        The result of ``documentByAnnotator`` for the document.  It is
        called with an error message when no text was selected.

    Raises:
        ValueError: If ``document_id`` or a category other than
            ``'Delete'`` is not an integer string.
    """
    document_id = int(request.POST['document_id'])
    text = request.POST['newNec']
    category = request.POST['newNecCategoryId']

    # 'Delete' has to be recognised before the integer conversion: int()
    # raises ValueError on it, which made this branch unreachable.
    if category == 'Delete':
        return documentByAnnotator(request, document_id,
                                   annotator_id=request.user.id, error=None)
    annotation_id = int(category)

    if not text:
        return documentByAnnotator(
            request, document_id, annotator_id=request.user.id,
            error="Please select text to add new anntations")

    annotator = Annotator.objects.get(id=request.user.id)
    annotation_type = AnnotationType.objects.get(id=annotation_id)
    document = Document.objects.get(id=document_id)

    for ind in util.findIndices(document.text, text):
        begin = ind[0]
        end = ind[1]
        # Skip spans this annotator already annotated exactly.
        if Annotation.objects.filter(document=document, begin_index=begin,
                                     end_index=end, annotator=annotator):
            continue

        # Remove the annotator's annotations that overlap the new span.
        allNamedEntities = Annotation.objects.filter(document=document,
                                                     annotator=annotator)
        for absNE in allNamedEntities:
            if util.overlap((absNE.begin_index, absNE.end_index),
                            (begin, end)):
                absNE.delete()

        # add new one
        annotation = Annotation(document=document, annotation=text,
                                begin_index=begin, end_index=end,
                                annotation_type=annotation_type,
                                annotator=annotator)
        annotation.save()

    return documentByAnnotator(request, document_id,
                               annotator_id=request.user.id, error=None)
def run_cba(Xtr, Ytr, Xt, Yt, lb, support=0.20, confidence=0.5, k=None,
            log=None):
    """Fit a CBA classifier, log its metrics and return its predictions.

    Args:
        Xtr: Training features; concatenated column-wise with ``Ytr`` via
            ``pd.concat``.
        Ytr: Training labels.
        Xt: Test features; concatenated column-wise with ``Yt``.
        Yt: Test labels; ``Yt.values`` is read for the AUC computation.
        lb: Object whose ``transform`` is applied to the true and predicted
            labels before ``roc_auc_score``.
        support: Forwarded to ``CBA``.
        confidence: Forwarded to ``CBA``.
        k: When not ``None``, only the first ``k`` rules are kept.
        log: Callable ``log(name, value[, index])``.  When ``None`` it is
            imported from the ``logger`` module.

    Returns:
        The predicted test labels as a list of ints.  Returned for
        consistency with the sibling ``run_*`` helpers; earlier callers
        that ignored the ``None`` result are unaffected.
    """
    txns_train = TransactionDB.from_DataFrame(pd.concat([Xtr, Ytr], axis=1))
    txns_test = TransactionDB.from_DataFrame(pd.concat([Xt, Yt], axis=1))

    cba = CBA(support=support, confidence=confidence, algorithm="m1")
    cba.fit(txns_train)
    if k is not None:
        cba.clf.rules = cba.clf.rules[:k]
    Y_pred = [int(i) for i in cba.predict(txns_test)]

    # Record, per rule, the indices of the training transactions whose
    # items include the rule's antecedent.
    for r in cba.clf.rules:
        r.covered = {
            i for i, rd in enumerate(txns_train) if r.antecedent <= rd
        }

    if log is None:
        from logger import log

    log('cba-k', len(cba.clf.rules))
    log('cba-rules', str(cba.clf.rules))
    for i, r in enumerate(cba.clf.rules):
        log('cba-nconds', len(r), i)
    log('cba-auc',
        roc_auc_score(lb.transform(Yt.values), lb.transform(Y_pred)))
    log('cba-bacc', balanced_accuracy_score(Yt, Y_pred))
    log('cba-disp', dispersion_(cba.clf.rules, average=True))
    log('cba-overlap', overlap(cba.clf.rules))
    print(confusion_matrix(Yt, Y_pred))
    return Y_pred
def run_ours(Xtr, Ytr, Xt, Yt, lb, nsample, lambda_mode, q, sample_mode,
             k=None, rerun=True, eps=0.01, min_recall_per_class=0.8,
             log=None):
    """Train a ``DecisionSet``, log its metrics and return its predictions.

    Every logged name is prefixed with ``'ours0-'`` or ``'ours1-'``
    depending on ``rerun``.

    Args:
        Xtr: Training features, forwarded to ``DecisionSet.train``.
        Ytr: Training labels, forwarded to ``DecisionSet.train``.
        Xt: Test features; each row of ``Xt.values`` is converted with
            ``feat2item`` and wrapped in a ``Transaction``.
        Yt: Test labels; ``Yt.values`` is read for the AUC computation.
        lb: Object whose ``transform`` is applied to the true and predicted
            labels before ``roc_auc_score``.
        nsample: Forwarded to ``train`` as ``nsamp``.
        lambda_mode: Forwarded to ``train`` as ``lamb``.
        q: Forwarded to ``train``.
        sample_mode: Forwarded to ``train`` as ``mode``.
        k: Forwarded to ``train`` as ``max_k``; 100 when ``None``.
        rerun: Forwarded to ``train``; also selects the log-name prefix.
        eps: Constructor argument of ``DecisionSet``.
        min_recall_per_class: Forwarded to ``train``.
        log: Callable ``log(name, value[, index])``.  When ``None`` it is
            imported from the ``logger`` module.

    Returns:
        The predictions produced by ``dec.predict_all`` for the test rows.
    """
    prefix = 'ours{}'.format(int(rerun)) + '-'
    if k is None:
        k = 100

    dec = DecisionSet(eps)
    dec.train(
        Xtr,
        Ytr,
        max_k=k,
        nsamp=nsample,
        lamb=lambda_mode,
        q=q,
        mode=sample_mode,
        rerun=rerun,
        min_recall_per_class=min_recall_per_class,
    )
    print('default:', dec.default)

    test_transactions = [Transaction(feat2item(row)) for row in Xt.values]
    Y_pred = dec.predict_all(test_transactions)

    if log is None:
        from logger import log

    log(prefix + 'default', dec.default)
    log(prefix + 'k', len(dec.rules))
    log(prefix + 'maxk', k)
    for position, rule in enumerate(dec.rules):
        log(prefix + 'nconds', len(rule), position)
    log(prefix + 'q', q)
    log(prefix + 'nsample', nsample)
    log(prefix + 'lamb', lambda_mode)
    log(prefix + 'seq', dec.seq)

    auc = roc_auc_score(lb.transform(Yt.values), lb.transform(Y_pred))
    log(prefix + 'auc', auc)
    log(prefix + 'bacc', balanced_accuracy_score(Yt, Y_pred))
    log(prefix + 'disp', dispersion(dec.rules, average=True))
    log(prefix + 'overlap', overlap(dec.rules))
    log(prefix + 'mode', sample_mode)

    # Per-label training precision and recall.
    for label, value in precision(dec).items():
        log(prefix + 'precisions-tr', value, label)
    for label, value in recall(dec.rules).items():
        log(prefix + 'recall-tr', value, label)

    print(confusion_matrix(Yt, Y_pred))
    return Y_pred
def run_ids(Xtr, Ytr, Xt, Yt, lb, min_freq, lambs, log=None):
    """Fit an IDS rule set, log its metrics and return its predictions.

    Args:
        Xtr: Training features.
        Ytr: Training labels; ``Ytr.values`` is given to ``IDS``.
        Xt: Test features, given to ``IDS_predict``.
        Yt: Test labels; ``Yt.values`` is read for the AUC computation.
        lb: Object whose ``transform`` is applied to the true and predicted
            labels before ``roc_auc_score``.
        min_freq: Forwarded to ``IDS`` as ``freq``.
        lambs: Sequence forwarded to ``IDS``; each entry is logged.
        log: Callable ``log(name, value[, index])``.  When ``None`` it is
            imported from the ``logger`` module.

    Returns:
        The predictions returned by ``IDS_predict`` for ``Xt``.
    """
    ids, nfreq, default = IDS(Xtr, Ytr.values, lambs, freq=min_freq)

    # Print a one-line summary of each rule followed by the rule itself.
    for rule in ids:
        cover_text = ', cover: {}/{}'.format(
            len(rule.get_correct_cover(Xtr, Ytr)),
            len(rule.get_cover(Xtr)),
        )
        print('class: ', rule.class_label, cover_text, end='; ')
        rule.print_rule()

    for rule in ids:
        rule.covered = set(rule.get_cover(Xtr))

    Y_pred = IDS_predict(ids, Xt, default=default)

    if log is None:
        from logger import log

    for position, lamb in enumerate(lambs):
        log('ids-lambda', lamb, position)
    log('ids-k', len(ids))
    for position, rule in enumerate(ids):
        log('ids-nconds', rule.get_length(), position)
    log('ids-nfreq', nfreq)
    log('ids-freq', min_freq)
    log('ids-default', default)

    auc = roc_auc_score(lb.transform(Yt.values), lb.transform(Y_pred))
    log('ids-auc', auc)
    log('ids-bacc', balanced_accuracy_score(Yt, Y_pred))
    log('ids-disp', dispersion_(ids, average=True))
    log('ids-overlap', overlap(ids))

    print(confusion_matrix(Yt, Y_pred))
    return Y_pred
def findOverlapIntervals(name1, name2, cutoffRatio):
    """Pair two named intervals when they overlap strongly enough.

    ``name1`` and ``name2`` are indexable records: element 0 is a label
    compared between the two, elements 1 and 2 are converted with ``int``
    and used as interval start and end.  The overlap returned by
    ``util.overlap`` is divided by each interval's length, and the pair is
    reported when the larger ratio exceeds ``cutoffRatio``.

    Args:
        name1: First record ``(label, start, end, ...)``.
        name2: Second record ``(label, start, end, ...)``.
        cutoffRatio: Threshold the larger overlap ratio must exceed.

    Returns:
        ``[(name1, name2)]`` when the threshold is exceeded, else ``[]``.
        A message is printed when the labels differ, but the pair is still
        evaluated.
    """
    interval1Start = int(name1[1])
    interval1End = int(name1[2])
    interval2Start = int(name2[1])
    interval2End = int(name2[2])

    overlap = util.overlap(interval1Start, interval1End, interval2Start,
                           interval2End)
    intervalLen1 = interval1End - interval1Start
    intervalLen2 = interval2End - interval2Start

    # A zero-length interval has no meaningful ratio; count it as no
    # overlap instead of dividing by zero.
    overlapRatio1 = float(overlap) / float(intervalLen1) if intervalLen1 else 0.0
    overlapRatio2 = float(overlap) / float(intervalLen2) if intervalLen2 else 0.0

    if name1[0] != name2[0]:
        print("Error!!! ", name1, name2)

    if max(overlapRatio1, overlapRatio2) > cutoffRatio:
        return [(name1, name2)]
    return []
def containsOverlapBorders(borders):
    """Find the first pair of borders that share a label and overlap.

    Each border is indexable: element 0 is a label, elements 1 and 2 are
    handed to ``util.overlap`` as interval bounds.  Pairs are examined in
    index order (``i < j``); only pairs with equal labels are tested.

    Args:
        borders: Sequence of border records.

    Returns:
        The index pair ``(i, j)`` of the first overlapping pair, or
        ``False`` when there is none.
    """
    total = len(borders)
    for i, left in enumerate(borders):
        for j in range(i + 1, total):
            right = borders[j]
            if not (left[0] == right[0]):
                continue
            if util.overlap(left[1], left[2], right[1], right[2]) > 0:
                return (i, j)
    return False
def viewSortDet(gtImages, detlist, numim=numpy.inf, opt="all", usetr=True,
                usedf=False, ovr=0.5):
    """Match sorted detections against ground truth and display each one.

    Ground-truth boxes are collected for the first ``numim`` images of
    ``gtImages`` and indexed by the image base name (path and extension
    stripped).  ``detlist`` is sorted in place with ``cmpscore``; every
    detection is then compared with the remaining boxes of its image and
    counts as a true positive when ``overlap`` reaches ``ovr``.  A matched
    box is removed so that it cannot be matched twice.

    Args:
        gtImages: Ground-truth source providing ``getTotal``, ``getBBox``,
            ``getImageName`` and ``getImageByName2``.
        detlist: Detections; element 0 is the image name, element 1 the
            score and elements 2-5 the box coordinates.  Sorted in place.
        numim: Maximum number of ground-truth images to read.
        opt: Unused.
        usetr: Unused.
        usedf: Unused.
        ovr: Minimum overlap for a detection to count as a match.

    Returns:
        Tuple ``(tp, fp, thr, tot)``: per-detection true-positive and
        false-positive indicator arrays, the per-detection scores, and the
        number of ground-truth boxes.
    """
    from functools import cmp_to_key

    try:
        wait_for_key = raw_input  # Python 2
    except NameError:
        wait_for_key = input  # Python 3

    dimg = {}
    tot = 0
    # int() keeps range() working when numim is a finite float.
    for idx in range(int(min(gtImages.getTotal(), numim))):
        boxes = gtImages.getBBox(idx)
        if boxes != []:
            name = gtImages.getImageName(idx).split("/")[-1].split(".")[0]
            dimg[name] = boxes
            tot = tot + len(boxes)

    tp = numpy.zeros(len(detlist))
    fp = numpy.zeros(len(detlist))
    thr = numpy.zeros(len(detlist))
    # list.sort() accepts a comparison function only through cmp_to_key
    # on Python 3.
    detlist.sort(key=cmp_to_key(cmpscore))

    for idx, detbb in enumerate(detlist):
        found = False
        # Start from an empty list so that the display code never draws
        # boxes left over from another image.
        rect = []
        if detbb[0] in dimg:
            rect = dimg[detbb[0]]
            for r in rect:
                rb = (float(detbb[3]), float(detbb[2]), float(detbb[5]),
                      float(detbb[4]))
                if overlap(rb, r) >= ovr:
                    rect.remove(r)
                    found = True
                    break
        if found:
            tp[idx] = 1
        else:
            fp[idx] = 1
        thr[idx] = detbb[1]

        # NOTE(review): `show` is not a parameter of this function; it
        # resolves to a module-level name -- confirm this is intended.
        if show:
            pylab.ioff()
            prec = numpy.sum(tp) / float(numpy.sum(tp) + numpy.sum(fp))
            rec = numpy.sum(tp) / tot
            print("Scr:", detbb[1], "Prec:", prec, "Rec:", rec)
            img = gtImages.getImageByName2(detbb[0])
            pylab.figure(1)
            pylab.clf()
            pylab.imshow(img)
            rb = (float(detbb[3]), float(detbb[2]), float(detbb[5]),
                  float(detbb[4]))
            # Remaining (unmatched) ground-truth boxes in blue.
            for r in rect:
                pylab.figure(1)
                pylab.ioff()
                box(r[0], r[1], r[2], r[3], 'b', lw=1.5)
            # The detection itself: green when matched, red otherwise.
            if found:
                box(rb[0], rb[1], rb[2], rb[3], 'g', lw=1.5)
            else:
                box(rb[0], rb[1], rb[2], rb[3], 'r', lw=1.5)
            pylab.draw()
            pylab.show()
            wait_for_key()
    return tp, fp, thr, tot
def autoScafMidSeam(self, strands):
    """Resize neighbouring scaffold strands and install seam crossovers.

    Walks consecutive entries of ``strands``.  When the current strand's
    helix is a neighbour of the previous one and has an odd number, both
    strands are resized to the pre-crossover positions nearest to the
    active base index and two crossovers are created at the new ends.
    For later pairs, crossovers are also installed between the previous
    strand and the one before it, near the middle of their shared range.

    Args:
        strands: Sequence of ``(row, col, strandIndex)`` triples; the index
            selects an entry of the helix's scaffold ``_strandList``.
    """
    part = self.part()
    strandType = StrandType.Scaffold
    idx = part.activeBaseIndex()
    for i in range(1, len(strands)):
        row1, col1, sSidx1 = strands[i-1]  # previous strand
        row2, col2, sSidx2 = strands[i]  # current strand
        vh1 = part.virtualHelixAtCoord((row1, col1))
        vh2 = part.virtualHelixAtCoord((row2, col2))
        strand1 = vh1.scaffoldStrandSet()._strandList[sSidx1]
        strand2 = vh2.scaffoldStrandSet()._strandList[sSidx2]

        # determine if the pair of strands are neighbors
        neighbors = part.getVirtualHelixNeighbors(vh1)
        if vh2 not in neighbors:
            continue
        p2 = neighbors.index(vh2)
        if vh2.number() % 2 != 1:
            continue

        # resize and install external xovers
        try:
            # resize to the nearest prexover on either side of idx
            newLo = util.nearest(
                idx, part.getPreXoversHigh(strandType, p2, maxIdx=idx-10))
            newHi = util.nearest(
                idx, part.getPreXoversLow(strandType, p2, minIdx=idx+10))
            if (strand1.canResizeTo(newLo, newHi)
                    and strand2.canResizeTo(newLo, newHi)):
                # do the resize
                strand1.resize((newLo, newHi))
                strand2.resize((newLo, newHi))
                # install xovers
                part.createXover(strand1, newHi, strand2, newHi)
                part.createXover(strand2, newLo, strand1, newLo)
        except ValueError:
            pass  # nearest not found in the expanded list

        # go back and install the internal xovers
        # NOTE(review): `i > 2` skips i == 2 although strands[0] exists --
        # confirm this is intended.
        if i > 2:
            row0, col0, sSidx0 = strands[i-2]  # two strands back
            vh0 = part.virtualHelixAtCoord((row0, col0))
            strand0 = vh0.scaffoldStrandSet()._strandList[sSidx0]
            if vh0 in neighbors:
                p0 = neighbors.index(vh0)
                l0, h0 = strand0.idxs()
                l1, h1 = strand1.idxs()
                oLow, oHigh = util.overlap(l0, h0, l1, h1)
                try:
                    # Lists and floor division keep the middle-element
                    # lookup working on Python 3, where filter() returns
                    # an iterator and `/` yields a float.
                    lList = [x for x in part.getPreXoversLow(strandType, p0)
                             if oLow < x < oHigh]
                    lX = lList[len(lList) // 2]
                    hList = [x for x in part.getPreXoversHigh(strandType, p0)
                             if oLow < x < oHigh]
                    hX = hList[len(hList) // 2]
                    # install high xover first
                    part.createXover(strand0, hX, strand1, hX)
                    # install low xover after getting new strands
                    # following the breaks caused by the high xover
                    strand3 = vh0.scaffoldStrandSet()._strandList[sSidx0]
                    strand4 = vh1.scaffoldStrandSet()._strandList[sSidx1]
                    part.createXover(strand4, lX, strand3, lX)
                except IndexError:
                    pass  # no pre-crossover inside the shared range
def distance(hits, i, j):
    """Return the distance between hit ``i`` and hit ``j``.

    A hit is an indexable record.  Elements 1 and 4 must agree between the
    two hits, otherwise the distance is infinite.  When elements 2, 5, 3
    and 6 all agree the distance is 0.  Otherwise the two regions of each
    hit, ``(h[7], h[8])`` and ``(h[9], h[10])``, are compared with
    ``util.overlap``: the distance is 0 when both overlap, else the negated
    sum of the non-positive overlap values (d = a + b).

    Args:
        hits: The list of hits.
        i: Index of the first hit.
        j: Index of the second hit.

    Returns:
        The distance between the two hits.
    """
    first = hits[i]
    second = hits[j]

    # Different chromosomes
    if not (first[1] == second[1] and first[4] == second[4]):
        return float("inf")

    # Same exons
    if all(first[field] == second[field] for field in (2, 5, 3, 6)):
        return 0

    # Overlap of the first and of the second region of the two hits
    ov1 = util.overlap((first[7], first[8]), (second[7], second[8]))
    ov2 = util.overlap((first[9], first[10]), (second[9], second[10]))

    # If they all overlap, distance is 0.
    if ov1 > 0 and ov2 > 0:
        return 0

    gap = min(0, ov1) + min(0, ov2)
    return -gap
def VOCprlistfast(gtImages, detlist, show=False, usetr=True, usedf=False,
                  ovr=0.5):
    """Mark each detection as true or false positive for a PR curve.

    Ground-truth boxes are indexed by the image base name (path and
    extension stripped).  ``detlist`` is sorted in place with ``cmpscore``;
    every detection is then compared with the remaining boxes of its image
    and counts as a true positive when ``overlap`` reaches ``ovr``.  A
    matched box is removed so that it cannot be matched twice.

    Args:
        gtImages: Ground-truth source providing ``getTotal``, ``getBBox``,
            ``getImageName`` and (when ``show`` is set) ``getImageByName2``.
        detlist: Detections; element 0 is the image name and elements 2-5
            the box coordinates.  Sorted in place.
        show: When true, display every detection and wait for a key press.
        usetr: Unused.
        usedf: Unused.
        ovr: Minimum overlap for a detection to count as a match.

    Returns:
        Tuple ``(tp, fp, tot)``: per-detection true-positive and
        false-positive indicator arrays and the number of ground-truth
        boxes.
    """
    from functools import cmp_to_key

    try:
        wait_for_key = raw_input  # Python 2
    except NameError:
        wait_for_key = input  # Python 3

    dimg = {}
    tot = 0
    for idx in range(gtImages.getTotal()):
        boxes = gtImages.getBBox(idx)
        if boxes != []:
            name = gtImages.getImageName(idx).split("/")[-1].split(".")[0]
            dimg[name] = boxes
            tot = tot + len(boxes)

    tp = numpy.zeros(len(detlist))
    fp = numpy.zeros(len(detlist))
    # list.sort() accepts a comparison function only through cmp_to_key
    # on Python 3.
    detlist.sort(key=cmp_to_key(cmpscore))

    for idx, detbb in enumerate(detlist):
        found = False
        # Start from an empty list so that the display code never draws
        # boxes left over from another image.
        rect = []
        if detbb[0] in dimg:
            rect = dimg[detbb[0]]
            for r in rect:
                rb = (float(detbb[3]), float(detbb[2]), float(detbb[5]),
                      float(detbb[4]))
                if overlap(rb, r) >= ovr:
                    rect.remove(r)
                    found = True
                    break
        if found:
            tp[idx] = 1
        else:
            fp[idx] = 1

        if show:
            pylab.ioff()
            img = gtImages.getImageByName2(detbb[0])
            pylab.figure(1)
            pylab.clf()
            pylab.imshow(img)
            rb = (float(detbb[3]), float(detbb[2]), float(detbb[5]),
                  float(detbb[4]))
            # Remaining (unmatched) ground-truth boxes in blue.
            for r in rect:
                pylab.figure(1)
                pylab.ioff()
                box(r[0], r[1], r[2], r[3], 'b', lw=1.5)
            # The detection itself: green when matched, red otherwise.
            if found:
                box(rb[0], rb[1], rb[2], rb[3], 'g', lw=1.5)
            else:
                box(rb[0], rb[1], rb[2], rb[3], 'r', lw=1.5)
            pylab.draw()
            pylab.show()
            wait_for_key()
    return tp, fp, tot
def run_cn2(Xtr, Ytr, Xt, Yt, lb, k=None, log=None):
    """Fit an Orange CN2 rule learner, log its metrics, return predictions.

    The training data is wrapped in Orange tables, the domains are passed
    through ``DomainDiscretizer`` and a ``CN2UnorderedLearner`` is fitted.
    The last entry of the learned rule list is treated as the default rule
    and excluded from the per-rule statistics.

    Args:
        Xtr: Training features; ``Xtr.values`` and ``Xtr.shape`` are read.
        Ytr: Training labels; ``Ytr.values`` is read.
        Xt: Test features; ``Xt.values`` is given to ``predict``.
        Yt: Test labels; ``Yt.values`` is read for the AUC computation.
        lb: Object whose ``transform`` is applied to the true and predicted
            labels before ``roc_auc_score``.
        k: When not ``None``, keep only the first ``k`` rules plus the
            default rule.
        log: Callable ``log(name, value[, index])``.  When ``None`` it is
            imported from the ``logger`` module.

    Returns:
        The predicted test labels (argmax over the prediction columns).
        Returned for consistency with the sibling ``run_*`` helpers;
        earlier callers that ignored the ``None`` result are unaffected.
    """
    domainx = Domain.from_numpy(Xtr.values)
    domainy = Domain.from_numpy(Ytr.values.reshape((-1, 1)))
    datax = Orange.data.Table.from_numpy(domainx, Xtr.values)
    datay = Orange.data.Table.from_numpy(domainy,
                                         Ytr.values.reshape((-1, 1)))
    discretizer = Orange.preprocess.DomainDiscretizer()
    domainx = discretizer(datax)
    domainy = discretizer(datay)
    domain = Domain(domainx.attributes, domainy.attributes[0])
    data = Orange.data.Table.from_numpy(domain, Xtr.values, Y=Ytr.values)

    learner = Orange.classification.CN2UnorderedLearner()
    learner.rule_finder.search_algorithm.beam_width = 10
    learner.rule_finder.search_strategy.constrain_continuous = True
    learner.rule_finder.general_validator.min_covered_examples = 15
    cn2 = learner(data)

    if k is not None:
        # Truncate to k rules but keep the trailing default rule.
        r_def = cn2.rule_list[-1]
        cn2.rule_list = cn2.rule_list[:k]
        cn2.rule_list.append(r_def)

    Y_pred = np.argmax(cn2.predict(Xt.values), axis=1)
    ids = np.arange(Xtr.shape[0])
    print('default:', cn2.rule_list[-1].prediction)

    # Skip the last default rule
    rules = cn2.rule_list[:-1]
    for i, r in enumerate(rules):
        cov = np.array([r.evaluate_instance(x) for x in data])
        pred = np.array([r.prediction] * sum(cov))
        acc = pred == Ytr.values[cov]
        r.covered = set(ids[cov])
        print(
            'CN2', '#{}, label:{}, len:{}, cov:{}, acc:{}'.format(
                i, r.prediction, r.length,
                sum(cov) / len(ids),
                sum(acc) / sum(cov)))

    if log is None:
        from logger import log

    log('cn2-k', len(rules))
    for i, r in enumerate(rules):
        log('cn2-nconds', r.length, i)
    log('cn2-auc',
        roc_auc_score(lb.transform(Yt.values), lb.transform(Y_pred)))
    log('cn2-bacc', balanced_accuracy_score(Yt, Y_pred))
    log('cn2-disp', dispersion_(rules, average=True))
    log('cn2-overlap', overlap(rules))
    print(confusion_matrix(Yt, Y_pred))
    return Y_pred
def newAnnotation(request):
    """Add an annotation for every free occurrence of the selected text.

    Reads ``document_id``, ``newNec`` (the selected text) and
    ``newNecCategoryId`` from ``request.POST``.  For each index pair
    returned by ``util.findIndices``, a new annotation of the chosen type
    is saved unless the requesting annotator already has an annotation on
    exactly that span or one overlapping it; existing annotations are never
    removed.

    Args:
        request: The incoming request; ``request.POST`` and
            ``request.user.id`` are read.

    Returns:
        An ``HttpResponse`` with the document text formatted by
        ``util.htmlFormat``, or the result of ``documentByAnnotator`` when
        the category is ``'Delete'`` or no text was selected.

    Raises:
        ValueError: If ``document_id`` or a category other than
            ``'Delete'`` is not an integer string.
    """
    document_id = int(request.POST['document_id'])
    text = request.POST['newNec']
    category = request.POST['newNecCategoryId']

    # 'Delete' has to be recognised before the integer conversion: int()
    # raises ValueError on it, which made this branch unreachable.
    if category == 'Delete':
        return documentByAnnotator(request, document_id,
                                   annotator_id=request.user.id, error=None)
    annotation_id = int(category)

    if not text:
        return documentByAnnotator(
            request, document_id, annotator_id=request.user.id,
            error="Please select text to add new anntations")

    annotator = Annotator.objects.get(id=request.user.id)
    annotation_type = AnnotationType.objects.get(id=annotation_id)
    document = Document.objects.get(id=document_id)

    for ind in util.findIndices(document.text, text):
        begin = ind[0]
        end = ind[1]
        # Skip spans this annotator already annotated exactly.
        if Annotation.objects.filter(document=document, begin_index=begin,
                                     end_index=end, annotator=annotator):
            continue

        # don't erase any existing entities: skip overlapping spans
        allNamedEntities = Annotation.objects.filter(document=document,
                                                     annotator=annotator)
        if any(util.overlap((absNE.begin_index, absNE.end_index),
                            (begin, end))
               for absNE in allNamedEntities):
            continue

        # add new one
        annotation = Annotation(document=document, annotation=text,
                                begin_index=begin, end_index=end,
                                annotation_type=annotation_type,
                                annotator=annotator)
        annotation.save()

    annotations = Annotation.objects.filter(document=document,
                                            annotator=annotator)
    text = util.htmlFormat(document.text, annotations)
    return HttpResponse(text)
def overlap_clusters(C):
    """Group clusters that overlap a later cluster of similar length.

    A cluster is an indexable record ``(label, start, end, ...)``.  Cluster
    ``j > i`` is attached to cluster ``i`` when both carry the same label,
    ``util.overlap`` of their ``(start, end)`` intervals is positive, and
    the ratio of their lengths (``end - start + 1``) lies strictly between
    0.8 and 1.25.

    Args:
        C: Sequence of clusters.

    Returns:
        A list with one entry per cluster ``i`` that has at least one
        match: the matching indices ``j`` in increasing order followed by
        ``i`` itself.

    Raises:
        ZeroDivisionError: If a compared cluster has ``end - start + 1``
            equal to zero.
    """
    overlaps = []
    # range() instead of xrange() so the function also runs on Python 3.
    for i in range(len(C) - 1):
        ci = C[i]
        cio = []
        for j in range(i + 1, len(C)):
            cj = C[j]
            rlen = float(ci[2] - ci[1] + 1) / float(cj[2] - cj[1] + 1)
            if (ci[0] == cj[0]
                    and util.overlap((ci[1], ci[2]), (cj[1], cj[2])) > 0
                    and 0.8 < rlen < 1.25):
                cio.append(j)
        if cio:
            overlaps.append(cio + [i])
    return overlaps
def VOCanalysis(gtImages, detlist, show=False, usetr=True, usedf=False,
                ovr=0.5):
    """Split detections into true positives and two kinds of false positive.

    Ground-truth boxes are indexed by the image base name (path and
    extension stripped).  ``detlist`` is sorted in place with ``cmpscore``;
    every detection is assigned to the ground-truth box of its image with
    the largest ``overlap``.  A detection whose best overlap exceeds
    ``ovr`` is a true positive when that box was not claimed before and a
    duplicate otherwise; all remaining detections are plain false
    positives.  Ground-truth boxes never claimed are reported as false
    negatives.

    Args:
        gtImages: Sequence of records with a ``"name"`` and a ``"bbox"``
            list.
        detlist: Detections; element 0 is the image name and elements 2-5
            the box coordinates.  Sorted in place.
        show: Unused.
        usetr: Unused.
        usedf: Unused.
        ovr: Overlap the best match must exceed.

    Returns:
        Tuple ``(tplist, fplist, fp2list, fnlist)``: true positives,
        duplicate detections of an already claimed box, detections without
        a sufficient match, and unclaimed ground-truth entries of the form
        ``[name, 0, box[0:4]]``.
    """
    from functools import cmp_to_key

    dimg = {}
    for idx in range(len(gtImages)):
        rect = gtImages[idx]["bbox"][:]
        if rect != []:
            name = gtImages[idx]["name"].split("/")[-1].split(".")[0]
            dimg[name] = {"bbox": rect, "det": [False] * len(rect)}

    tplist = []
    fplist = []
    fp2list = []
    fnlist = []
    # list.sort() accepts a comparison function only through cmp_to_key
    # on Python 3.
    detlist.sort(key=cmp_to_key(cmpscore))

    for detbb in detlist:
        maxovr = 0
        gt = 0
        if detbb[0] in dimg:
            # Pick the ground-truth box with the largest overlap; later
            # boxes win ties.
            for ir, r in enumerate(dimg[detbb[0]]["bbox"]):
                rb = (float(detbb[3]), float(detbb[2]), float(detbb[5]),
                      float(detbb[4]))
                covr = overlap(rb, r)
                if covr >= maxovr:
                    maxovr = covr
                    gt = ir

        if maxovr > ovr:
            claimed = dimg[detbb[0]]["det"]
            if not claimed[gt]:
                claimed[gt] = True
                tplist.append(detbb)
            else:
                fplist.append(detbb)
        else:
            fp2list.append(detbb)

    totalDetected = 0
    totalnoDetected = 0
    for idx in range(len(gtImages)):
        rect = gtImages[idx]["bbox"][:]
        if rect != []:
            name = gtImages[idx]["name"].split("/")[-1].split(".")[0]
            bboxgt = dimg[name]
            for i, detected in enumerate(bboxgt["det"]):
                if detected:
                    # bbox FOUND, it's ok
                    totalDetected += 1
                else:
                    # bbox not FOUND, add to FN
                    fnlist.append([name, 0, bboxgt["bbox"][i][0:4]])
                    totalnoDetected += 1

    print("total Detected %d, total no Detected %d" % (totalDetected,
                                                       totalnoDetected))
    return tplist, fplist, fp2list, fnlist
def VOCprRecordOptim(gtImages, detlist, show=False, ovr=0.5, pixels=None):
    """Mark detections for a PR curve and record their box offsets.

    Ground-truth boxes are indexed by the image base name (path and
    extension stripped); only boxes whose element 5 equals 0 are counted in
    the total.  ``detlist`` is sorted in place with ``cmpscore``; every
    detection is assigned to the ground-truth box of its image with the
    largest overlap (``overlap``, or ``overlapx`` when ``pixels`` is
    given).  When the best overlap exceeds ``ovr`` and the box has element
    5 equal to 0, the detection is a true positive if the box was not
    claimed before and a false positive otherwise; a best match on a box
    with a non-zero element 5 is counted as neither.  For true positives
    the centre offsets and size ratios between ground truth and detection
    are recorded.

    Args:
        gtImages: Sequence of records with a ``"name"`` and a ``"bbox"``
            list.
        detlist: Detections; element 0 is the image name, element 1 the
            score and elements 2-5 the box coordinates.  Sorted in place.
        show: When true, print running precision/recall, wait for input
            and display the detection.
        ovr: Overlap the best match must exceed.
        pixels: When not ``None``, forwarded to ``overlapx``.

    Returns:
        Tuple ``(tp, fp, thr, tot, tx, ty, sx, sy)``: per-detection
        true-positive and false-positive indicator arrays, per-detection
        scores, the number of counted ground-truth boxes, and the lists of
        relative centre offsets (``tx``, ``ty``) and size ratios (``sx``,
        ``sy``) of the true positives.
    """
    from functools import cmp_to_key

    try:
        wait_for_key = raw_input  # Python 2
    except NameError:
        wait_for_key = input  # Python 3

    tx = []
    ty = []
    sx = []
    sy = []
    dimg = {}
    tot = 0
    for idx in range(len(gtImages)):
        boxes = gtImages[idx]["bbox"][:]
        if boxes != []:
            name = gtImages[idx]["name"].split("/")[-1].split(".")[0]
            dimg[name] = {"bbox": boxes, "det": [False] * len(boxes)}
            for recti in boxes:
                if recti[5] == 0:
                    tot = tot + 1

    tp = numpy.zeros(len(detlist))
    fp = numpy.zeros(len(detlist))
    thr = numpy.zeros(len(detlist))
    # list.sort() accepts a comparison function only through cmp_to_key
    # on Python 3.
    detlist.sort(key=cmp_to_key(cmpscore))

    for idx, detbb in enumerate(detlist):
        # NOTE(review): `found` is never set to True in this function, so
        # the display below is shown for every detection and always uses
        # the red box -- confirm whether it should follow tp[idx].
        found = False
        maxovr = 0
        gt = 0
        # Start from an empty list so that the display code never draws
        # boxes left over from another image.
        rect = []
        if detbb[0] in dimg:
            rect = dimg[detbb[0]]["bbox"]
            # Pick the ground-truth box with the largest overlap; later
            # boxes win ties.
            for ir, r in enumerate(rect):
                rb = (float(detbb[3]), float(detbb[2]), float(detbb[5]),
                      float(detbb[4]))
                if pixels is None:
                    covr = overlap(rb, r)
                else:
                    covr = overlapx(rb, r, pixels)
                if covr >= maxovr:
                    maxovr = covr
                    gt = ir

        if maxovr > ovr:
            gtbox = dimg[detbb[0]]["bbox"][gt]
            if gtbox[5] == 0:
                if not dimg[detbb[0]]["det"][gt]:
                    tp[idx] = 1
                    dimg[detbb[0]]["det"][gt] = True
                    # Extents and centres of ground truth and detection.
                    gtx = gtbox[3] - gtbox[1]
                    dtx = detbb[4] - detbb[2]
                    gty = gtbox[2] - gtbox[0]
                    dty = detbb[5] - detbb[3]
                    gtcx = (gtbox[3] + gtbox[1]) / 2.
                    dtcx = (detbb[4] + detbb[2]) / 2.
                    gtcy = (gtbox[2] + gtbox[0]) / 2.
                    dtcy = (detbb[5] + detbb[3]) / 2.
                    tx.append((gtcx - dtcx) / float(dtx))
                    ty.append((gtcy - dtcy) / float(dty))
                    sx.append(gtx / float(dtx))
                    sy.append(gty / float(dty))
                else:
                    fp[idx] = 1
        else:
            fp[idx] = 1
        thr[idx] = detbb[1]

        if show:
            prec = numpy.sum(tp) / float(numpy.sum(tp) + numpy.sum(fp))
            rec = numpy.sum(tp) / tot
            print("Scr:", detbb[1], "Prec:%.3f" % prec, "Rec:%.3f" % rec)
            ss = wait_for_key()
            if ss == "s" or not found:
                pylab.ioff()
                # NOTE(review): gtImages is indexed like a sequence above
                # but called as an object here -- confirm the display path
                # works with the ground-truth type in use.
                img = gtImages.getImageByName2(detbb[0])
                pylab.figure(1)
                pylab.clf()
                pylab.imshow(img)
                rb = (float(detbb[3]), float(detbb[2]), float(detbb[5]),
                      float(detbb[4]))
                for r in rect:
                    pylab.figure(1)
                    pylab.ioff()
                    box(r[0], r[1], r[2], r[3], 'b', lw=1.5)
                if found:
                    box(rb[0], rb[1], rb[2], rb[3], 'g', lw=1.5)
                else:
                    box(rb[0], rb[1], rb[2], rb[3], 'r', lw=1.5)
                pylab.draw()
                pylab.show()
    return tp, fp, thr, tot, tx, ty, sx, sy
def newAnnotation(request):
    """Add an annotation for every free occurrence of the selected text.

    Reads ``document_id``, ``newNec`` (the selected text) and
    ``newNecCategoryId`` from ``request.POST``.  For each index pair
    returned by ``util.findIndices``, a new annotation of the chosen type
    is saved unless the requesting annotator already has an annotation on
    exactly that span or one overlapping it; existing annotations are never
    removed.

    Args:
        request: The incoming request; ``request.POST`` and
            ``request.user.id`` are read.

    Returns:
        An ``HttpResponse`` with the document text formatted by
        ``util.htmlFormat``, or the result of ``documentByAnnotator`` when
        the category is ``'Delete'`` or no text was selected.

    Raises:
        ValueError: If ``document_id`` or a category other than
            ``'Delete'`` is not an integer string.
    """
    document_id = int(request.POST['document_id'])
    text = request.POST['newNec']
    category = request.POST['newNecCategoryId']

    # 'Delete' has to be recognised before the integer conversion: int()
    # raises ValueError on it, which made this branch unreachable.
    if category == 'Delete':
        return documentByAnnotator(request,
                                   document_id,
                                   annotator_id=request.user.id,
                                   error=None)
    annotation_id = int(category)

    if not text:
        return documentByAnnotator(
            request,
            document_id,
            annotator_id=request.user.id,
            error="Please select text to add new anntations")

    annotator = Annotator.objects.get(id=request.user.id)
    annotation_type = AnnotationType.objects.get(id=annotation_id)
    document = Document.objects.get(id=document_id)

    for ind in util.findIndices(document.text, text):
        begin = ind[0]
        end = ind[1]
        # Skip spans this annotator already annotated exactly.
        if Annotation.objects.filter(document=document,
                                     begin_index=begin,
                                     end_index=end,
                                     annotator=annotator):
            continue

        # don't erase any existing entities: skip overlapping spans
        allNamedEntities = Annotation.objects.filter(document=document,
                                                     annotator=annotator)
        foundOverlap = any(
            util.overlap((absNE.begin_index, absNE.end_index), (begin, end))
            for absNE in allNamedEntities)
        if foundOverlap:
            continue

        # add new one
        annotation = Annotation(document=document,
                                annotation=text,
                                begin_index=begin,
                                end_index=end,
                                annotation_type=annotation_type,
                                annotator=annotator)
        annotation.save()

    annotations = Annotation.objects.filter(document=document,
                                            annotator=annotator)
    text = util.htmlFormat(document.text, annotations)
    return HttpResponse(text)
def run(self):
    """Filter the configured FASTQ input and write reads, stats and a report.

    Everything is driven by ``self.options``.  The method:

    1. Collects pre-filter QC on read1 (and read2 when given) and, when
       ``trim_front`` / ``trim_tail`` are ``-1``, takes them from
       ``autoTrim()``.
    2. Resolves the good / bad / overlap / QC output folders, creating them
       when missing, and opens the output writers (none when ``qc_only``).
    3. Streams the records.  Each record is passed through the barcode,
       trim, bubble, length, polyX, low-quality and N-count filters, and for
       pairs through the overlap analysis (adapter trimming, mismatch
       correction / masking).  Rejected records go to the "bad" writers
       tagged with a reason code; accepted ones go to the "good" writers and
       feed the post-filter QC.
    4. Writes ``<read1 basename>.json`` into the QC folder and emits the
       HTML report through ``QCReporter``.

    Read records are indexed positionally: index 1 is used as the base
    sequence and index 3 as the per-base quality string.

    Returns:
        None.
    """
    opts = self.options
    paired = opts.read2_file is not None

    if opts.debubble:
        self.loadBubbleCircles()

    # read1_file is required
    read1_file = fastq.Reader(opts.read1_file)

    # no front trim if sequence is barcoded
    if opts.barcode:
        opts.trim_front = 0

    reporter = QCReporter()

    self.r1qc_prefilter = QualityControl(opts.qc_sample, opts.qc_kmer)
    self.r2qc_prefilter = QualityControl(opts.qc_sample, opts.qc_kmer)
    self.r1qc_prefilter.statFile(opts.read1_file)
    if paired:
        self.r2qc_prefilter.statFile(opts.read2_file)

    self.r1qc_postfilter = QualityControl(opts.qc_sample, opts.qc_kmer)
    self.r2qc_postfilter = QualityControl(opts.qc_sample, opts.qc_kmer)

    readLen = self.r1qc_prefilter.readLen
    overlap_histgram = [0] * (readLen + 1)
    distance_histgram = [0] * (readLen + 1)

    # auto detect trim front and trim tail
    if opts.trim_front == -1 or opts.trim_tail == -1:
        # auto trim for read1
        trimFront, trimTail = self.r1qc_prefilter.autoTrim()
        if opts.trim_front == -1:
            opts.trim_front = trimFront
        if opts.trim_tail == -1:
            opts.trim_tail = trimTail

        # auto trim for read2
        if paired:
            # check if we should keep same trimming for read1/read2 to keep
            # their length identical; this option is on by default because
            # lots of dedup algorithms require this feature
            if opts.trim_pair_same:
                opts.trim_front2 = opts.trim_front
                opts.trim_tail2 = opts.trim_tail
            else:
                trimFront2, trimTail2 = self.r2qc_prefilter.autoTrim()
                if opts.trim_front2 == -1:
                    opts.trim_front2 = trimFront2
                if opts.trim_tail2 == -1:
                    opts.trim_tail2 = trimTail2

    print(opts.read1_file + " options:")
    print(opts)

    # ------------------------------------------------------------------
    # output folders
    # ------------------------------------------------------------------
    # if good output folder not specified, use the folder of read1 file
    good_dir = opts.good_output_folder
    if good_dir is None:
        good_dir = os.path.dirname(opts.read1_file)

    # "bad", "overlap" and "QC" default to siblings of the good folder
    parent_dir = os.path.dirname(os.path.dirname(good_dir + "/"))

    bad_dir = opts.bad_output_folder
    if bad_dir is None:
        bad_dir = os.path.join(parent_dir, "bad")

    overlap_dir = opts.overlap_output_folder
    if overlap_dir is None:
        overlap_dir = os.path.join(parent_dir, "overlap")

    qc_base_folder = opts.report_output_folder
    if qc_base_folder is None:
        qc_base_folder = os.path.join(parent_dir, "QC")
    if not os.path.exists(qc_base_folder):
        os.makedirs(qc_base_folder)
    qc_dir = qc_base_folder

    if not os.path.exists(good_dir):
        os.makedirs(good_dir)
    if not os.path.exists(bad_dir):
        os.makedirs(bad_dir)
    if opts.store_overlap and paired and (not os.path.exists(overlap_dir)):
        os.makedirs(overlap_dir)

    # gzip the output when asked to, or when the input itself is gzipped
    gzip_out = opts.gzip
    gzip_comp = opts.compression
    if not gzip_out and opts.read1_file.endswith(".gz"):
        gzip_out = True

    def open_writer(folder, source, suffix):
        # Writer for <folder>/<main name of source><suffix>.
        return fastq.Writer(
            os.path.join(folder, getMainName(source) + suffix), gzip_out,
            gzip_comp)

    # ------------------------------------------------------------------
    # input readers / output writers
    # ------------------------------------------------------------------
    good_read1_file = None
    bad_read1_file = None
    overlap_read1_file = None
    if not opts.qc_only:
        good_read1_file = open_writer(good_dir, opts.read1_file, ".good.fq")
        bad_read1_file = open_writer(bad_dir, opts.read1_file, ".bad.fq")
        if opts.store_overlap:
            overlap_read1_file = open_writer(overlap_dir, opts.read1_file,
                                             ".overlap.fq")

    # other files are optional
    read2_file = None
    good_read2_file = None
    bad_read2_file = None
    overlap_read2_file = None
    index1_file = None
    good_index1_file = None
    bad_index1_file = None
    overlap_index1_file = None
    index2_file = None
    good_index2_file = None
    bad_index2_file = None
    overlap_index2_file = None

    # if other files are specified, then read them
    if paired:
        read2_file = fastq.Reader(opts.read2_file)
        if not opts.qc_only:
            good_read2_file = open_writer(good_dir, opts.read2_file,
                                          ".good.fq")
            bad_read2_file = open_writer(bad_dir, opts.read2_file, ".bad.fq")
            if opts.store_overlap:
                overlap_read2_file = open_writer(overlap_dir,
                                                 opts.read2_file,
                                                 ".overlap.fq")
    if opts.index1_file is not None:
        index1_file = fastq.Reader(opts.index1_file)
        if not opts.qc_only:
            good_index1_file = open_writer(good_dir, opts.index1_file,
                                           ".good.fq")
            bad_index1_file = open_writer(bad_dir, opts.index1_file,
                                          ".bad.fq")
            if opts.store_overlap and paired:
                overlap_index1_file = open_writer(overlap_dir,
                                                  opts.index1_file,
                                                  ".overlap.fq")
    if opts.index2_file is not None:
        index2_file = fastq.Reader(opts.index2_file)
        if not opts.qc_only:
            good_index2_file = open_writer(good_dir, opts.index2_file,
                                           ".good.fq")
            bad_index2_file = open_writer(bad_dir, opts.index2_file,
                                          ".bad.fq")
            if opts.store_overlap and paired:
                overlap_index2_file = open_writer(overlap_dir,
                                                  opts.index2_file,
                                                  ".overlap.fq")

    r1 = None
    r2 = None
    i1 = None
    i2 = None

    def reject(flag):
        # Send the current record to the "bad" writers, tagged with `flag`.
        self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file,
                        bad_index1_file, bad_index2_file, flag)

    # stat numbers
    TOTAL_BASES = 0
    GOOD_BASES = 0
    TOTAL_READS = 0
    GOOD_READS = 0
    BAD_READS = 0
    BADBCD1 = 0
    BADBCD2 = 0
    BADTRIM1 = 0
    BADTRIM2 = 0
    BADBBL = 0
    BADLEN = 0
    BADPOL = 0
    BADLQC = 0
    BADNCT = 0
    BADINDEL = 0
    BADMISMATCH = 0
    BADDIFF = 0
    READ_CORRECTED = 0
    BASE_CORRECTED = 0
    BASE_SKIPPED_CORRECTION = 0
    BASE_ZERO_QUAL_MASKED = 0
    OVERLAPPED = 0
    OVERLAP_LEN_SUM = 0
    OVERLAP_BASE_SUM = 0
    # error profiling by overlap analysis
    OVERLAP_BASE_ERR = 0
    OVERLAP_ERR_MATRIX = init_error_matrix()
    # adapter trimming by overlap analysis
    TRIMMED_ADAPTER_BASE = 0
    TRIMMED_ADAPTER_READ = 0

    # ------------------------------------------------------------------
    # main loop: one record (read1 [+ read2] [+ index1] [+ index2]) per pass
    # ------------------------------------------------------------------
    while True:
        r1 = read1_file.nextRead()
        if r1 is None:
            break
        TOTAL_BASES += len(r1[1])

        if read2_file is not None:
            r2 = read2_file.nextRead()
            if r2 is None:
                break
            # Count read2 bases as soon as read2 is available.  They used to
            # be counted only when an index2 record had been read, which
            # under-reported paired runs without an index2 file.
            TOTAL_BASES += len(r2[1])
        if index1_file is not None:
            i1 = index1_file.nextRead()
            if i1 is None:
                break
        if index2_file is not None:
            i2 = index2_file.nextRead()
            if i2 is None:
                break

        TOTAL_READS += 1

        # barcode processing
        if opts.barcode:
            barcodeLen1 = barcodeprocesser.detectBarcode(
                r1[1], opts.barcode_length, opts.barcode_verify)
            if barcodeLen1 == 0:
                reject("BADBCD1")
                BADBCD1 += 1
                continue
            if r2 is None:
                barcodeprocesser.moveBarcodeToName(r1, opts.barcode_length,
                                                   opts.barcode_verify)
            else:
                barcodeLen2 = barcodeprocesser.detectBarcode(
                    r2[1], opts.barcode_length, opts.barcode_verify)
                if barcodeLen2 == 0:
                    reject("BADBCD2")
                    BADBCD2 += 1
                    continue
                barcodeprocesser.moveAndTrimPair(r1, r2, barcodeLen1,
                                                 barcodeLen2,
                                                 opts.barcode_verify)

        # trim
        if opts.trim_front > 0 or opts.trim_tail > 0:
            r1 = trim(r1, opts.trim_front, opts.trim_tail)
            if len(r1[1]) < 5:
                reject("BADTRIM1")
                BADTRIM1 += 1
                continue
            if r2 is not None:
                r2 = trim(r2, opts.trim_front2, opts.trim_tail2)
                if len(r2[1]) < 5:
                    reject("BADTRIM2")
                    BADTRIM2 += 1
                    continue

        # filter debubble
        if opts.debubble and self.isInBubble(r1[0]):
            reject("BADBBL")
            BADBBL += 1
            continue

        # filter sequence length
        if len(r1[1]) < opts.seq_len_req:
            reject("BADLEN")
            BADLEN += 1
            continue

        # check polyX
        if opts.poly_size_limit > 0:
            poly1 = hasPolyX(r1[1], opts.poly_size_limit,
                             opts.allow_mismatch_in_poly)
            poly2 = None
            if r2 is not None:
                poly2 = hasPolyX(r2[1], opts.poly_size_limit,
                                 opts.allow_mismatch_in_poly)
            if poly1 is not None or poly2 is not None:
                reject("BADPOL")
                BADPOL += 1
                continue

        # check low quality count
        if opts.unqualified_base_limit > 0:
            lowQual1 = lowQualityNum(r1, opts.qualified_quality_phred)
            lowQual2 = 0
            if r2 is not None:
                lowQual2 = lowQualityNum(r2, opts.qualified_quality_phred)
            # The second operand must test read2; it previously repeated the
            # read1 test, so a low-quality read2 was never rejected.
            if (lowQual1 > opts.unqualified_base_limit
                    or lowQual2 > opts.unqualified_base_limit):
                reject("BADLQC")
                BADLQC += 1
                continue

        # check N number
        if opts.n_base_limit > 0:
            nNum1 = nNumber(r1)
            nNum2 = 0
            if r2 is not None:
                nNum2 = nNumber(r2)
            if nNum1 > opts.n_base_limit or nNum2 > opts.n_base_limit:
                reject("BADNCT")
                BADNCT += 1
                continue

        # check overlap and do error correction
        if r2 is not None and (not opts.no_overlap):
            (offset, overlap_len, distance) = util.overlap(r1[1], r2[1])
            overlap_histgram[overlap_len] += 1

            # deal with the case insert DNA is shorter than read length and
            # cause offset is negative; in this case the adapter is
            # sequenced and should be trimmed
            if offset < 0 and overlap_len > 30:
                # shift the junk bases
                r1[1] = r1[1][0:overlap_len]
                r1[3] = r1[3][0:overlap_len]
                r2[1] = r2[1][0:overlap_len]
                r2[3] = r2[3][0:overlap_len]
                TRIMMED_ADAPTER_BASE += abs(offset) * 2
                TRIMMED_ADAPTER_READ += 1
                # check the sequence length again after adapter trimmed
                if len(r1[1]) < opts.seq_len_req:
                    reject("BADLEN")
                    BADLEN += 1
                    continue
                # then calc overlap again
                (offset, overlap_len, distance) = util.overlap(r1[1], r2[1])

            distance_histgram[distance] += 1
            # if distance is too high, then set it as bad mismatch
            if distance > 3:
                reject("BADDIFF")
                BADDIFF += 1
                continue

            if overlap_len > 30:
                OVERLAPPED += 1
                OVERLAP_LEN_SUM += overlap_len
                # we consider the distance is caused by sequencing error
                OVERLAP_BASE_SUM += overlap_len * 2
                OVERLAP_BASE_ERR += distance
                corrected = 0
                zero_qual_masked = 0
                skipped_mismatch = 0
                if distance > 0:
                    # try to fix low quality base
                    err_mtx = init_error_matrix()
                    for o in range(overlap_len):
                        # read1 is walked forward through the overlap,
                        # read2 backward from its end (complemented)
                        b1 = r1[1][len(r1[1]) - overlap_len + o]
                        b2 = util.complement(r2[1][-o - 1])
                        q1 = r1[3][len(r1[3]) - overlap_len + o]
                        q2 = r2[3][-o - 1]
                        if b1 == b2:
                            continue
                        this_is_corrected = False
                        if util.qualNum(q1) >= 30 and util.qualNum(q2) <= 14:
                            # read1 is trusted, read2 is overwritten
                            if b1 != 'N' and b2 != 'N':
                                err_mtx[util.complement(b1)][util.complement(
                                    b2)] += 1
                            if not opts.no_correction:
                                r2[1] = util.changeString(
                                    r2[1], -o - 1, util.complement(b1))
                                r2[3] = util.changeString(r2[3], -o - 1, q1)
                                corrected += 1
                                this_is_corrected = True
                        elif util.qualNum(q2) >= 30 and util.qualNum(
                                q1) <= 14:
                            # read2 is trusted, read1 is overwritten
                            if b1 != 'N' and b2 != 'N':
                                err_mtx[b2][b1] += 1
                            if not opts.no_correction:
                                r1[1] = util.changeString(
                                    r1[1], len(r1[1]) - overlap_len + o, b2)
                                r1[3] = util.changeString(
                                    r1[3], len(r1[3]) - overlap_len + o, q2)
                                corrected += 1
                                this_is_corrected = True
                        if not this_is_corrected:
                            if opts.mask_mismatch:
                                # mask them as zero qual if not corrected
                                zero_qual = '!'
                                r2[3] = util.changeString(
                                    r2[3], -o - 1, zero_qual)
                                r1[3] = util.changeString(
                                    r1[3], len(r1[3]) - overlap_len + o,
                                    zero_qual)
                                zero_qual_masked += 1
                            else:
                                skipped_mismatch += 1
                        # every reported difference has been handled
                        if (corrected + zero_qual_masked + skipped_mismatch
                                >= distance):
                            break

                    if (corrected + zero_qual_masked + skipped_mismatch ==
                            distance):
                        merge_error_matrix(OVERLAP_ERR_MATRIX, err_mtx)
                        if corrected > 0:
                            READ_CORRECTED += 1
                        BASE_CORRECTED += corrected
                        # multiply by 2 since we mask bases by pair
                        BASE_ZERO_QUAL_MASKED += zero_qual_masked * 2
                        BASE_SKIPPED_CORRECTION += skipped_mismatch * 2
                    else:
                        reject("BADMISMATCH")
                        BADMISMATCH += 1
                        continue

                if distance == 0 or distance == corrected:
                    if opts.store_overlap:
                        self.writeReads(getOverlap(r1, overlap_len),
                                        getOverlap(r2, overlap_len), i1, i2,
                                        overlap_read1_file,
                                        overlap_read2_file,
                                        overlap_index1_file,
                                        overlap_index2_file, None)

        # write to good
        self.writeReads(r1, r2, i1, i2, good_read1_file, good_read2_file,
                        good_index1_file, good_index2_file, None)
        GOOD_BASES += len(r1[1])
        # Key on read2 itself (this used to test i2), mirroring TOTAL_BASES.
        if r2 is not None:
            GOOD_BASES += len(r2[1])
        if opts.qc_sample <= 0 or TOTAL_READS < opts.qc_sample:
            self.r1qc_postfilter.statRead(r1)
            if r2 is not None:
                self.r2qc_postfilter.statRead(r2)

        GOOD_READS += 1
        if opts.qc_only and TOTAL_READS >= opts.qc_sample:
            break

    self.r1qc_postfilter.qc()
    if paired:
        self.r2qc_postfilter.qc()

    # close all files
    if not opts.qc_only:
        good_read1_file.close()
        bad_read1_file.close()
        if paired:
            good_read2_file.close()
            bad_read2_file.close()
        if opts.index1_file is not None:
            good_index1_file.close()
            bad_index1_file.close()
        if opts.index2_file is not None:
            good_index2_file.close()
            bad_index2_file.close()
    # The overlap writers were previously never closed explicitly.
    for overlap_writer in (overlap_read1_file, overlap_read2_file,
                           overlap_index1_file, overlap_index2_file):
        if overlap_writer is not None:
            overlap_writer.close()

    # ------------------------------------------------------------------
    # summary numbers
    # ------------------------------------------------------------------
    BAD_READS = TOTAL_READS - GOOD_READS
    bad_length = BADLEN + BADTRIM1 + BADTRIM2
    bad_overlap = BADMISMATCH + BADINDEL + BADDIFF
    bad_barcode = BADBCD1 + BADBCD2
    result = {
        'total_bases': TOTAL_BASES,
        'good_bases': GOOD_BASES,
        'total_reads': TOTAL_READS,
        'good_reads': GOOD_READS,
        'bad_reads': BAD_READS,
        'bad_reads_with_bad_barcode': bad_barcode,
        'bad_reads_with_reads_in_bubble': BADBBL,
        'bad_reads_with_bad_read_length': bad_length,
        'bad_reads_with_polyX': BADPOL,
        'bad_reads_with_low_quality': BADLQC,
        'bad_reads_with_too_many_N': BADNCT,
        'bad_reads_with_bad_overlap': bad_overlap,
        'readlen': readLen,
    }

    # result bar figure
    labels = [
        'good reads', 'has_polyX', 'low_quality', 'too_short', 'too_many_N'
    ]
    counts = [GOOD_READS, BADPOL, BADLQC, bad_length, BADNCT]
    if paired:
        labels.append('bad_overlap')
        counts.append(bad_overlap)
    if opts.debubble:
        labels.append('in_bubble')
        counts.append(BADBBL)
    if opts.barcode:
        labels.append('bad_barcode')
        counts.append(bad_barcode)

    for i, count in enumerate(counts):
        type_percent = 0.0
        if TOTAL_READS > 0:
            type_percent = 100.0 * float(count) / TOTAL_READS
        labels[i] = labels[i] + ": " + str(count) + "(" + str(
            type_percent) + "%)"

    reporter.addFigure(
        'Good reads and bad reads after filtering',
        self.r1qc_prefilter.statPlotly(labels, counts, TOTAL_READS,
                                       'filter_stat'), 'filter_stat', "")

    # squeeze qc data for JSON output
    self.r1qc_prefilter.squeeze()
    self.r1qc_postfilter.squeeze()
    if paired:
        self.r2qc_prefilter.squeeze()
        self.r2qc_postfilter.squeeze()

    # ------------------------------------------------------------------
    # JSON stats
    # ------------------------------------------------------------------
    stat = {}
    stat["afterqc_main_summary"] = result
    stat["command"] = makeDict(opts)

    qc_sets = [("read1_prefilter", self.r1qc_prefilter),
               ("read1_postfilter", self.r1qc_postfilter)]
    if paired:
        qc_sets.append(("read2_prefilter", self.r2qc_prefilter))
        qc_sets.append(("read2_postfilter", self.r2qc_postfilter))

    # more data is put in the JSON file for offline plotting directly from it
    stat["kmer_content"] = {}
    stat["base_quality"] = {}
    stat["mean_quality"] = {}
    stat["base_content"] = {}
    stat["gc_content"] = {}
    for qc_key, qc in qc_sets:
        stat["kmer_content"][qc_key] = qc.topKmerCount[0:10]
        stat["base_quality"][qc_key] = qc.baseMeanQual
        stat["mean_quality"][qc_key] = qc.meanQual
        stat["base_content"][qc_key] = qc.percents
        stat["gc_content"][qc_key] = qc.gcPercents

    if paired:
        overlap_stat = {}
        overlap_stat['overlapped_pairs'] = OVERLAPPED
        if OVERLAPPED > 0:
            # Convert before dividing: float(a / b) truncates first under
            # Python 2 integer division.
            overlap_stat['average_overlap_length'] = float(
                OVERLAP_LEN_SUM) / OVERLAPPED
        else:
            overlap_stat['average_overlap_length'] = 0.0
        overlap_stat['bad_mismatch_reads'] = BADMISMATCH
        overlap_stat['bad_diff'] = BADDIFF
        overlap_stat['bad_indel_reads'] = BADINDEL
        overlap_stat['corrected_reads'] = READ_CORRECTED
        overlap_stat['corrected_bases'] = BASE_CORRECTED
        overlap_stat['skipped_correction_bases'] = BASE_SKIPPED_CORRECTION
        overlap_stat['zero_qual_masked'] = BASE_ZERO_QUAL_MASKED
        # NOTE(review): 'zero_qual_skipped' carries the same value as
        # 'zero_qual_masked'; kept as-is, confirm whether that is intended.
        overlap_stat['zero_qual_skipped'] = BASE_ZERO_QUAL_MASKED
        overlap_stat['trimmed_adapter_bases'] = TRIMMED_ADAPTER_BASE
        overlap_stat['trimmed_adapter_reads'] = TRIMMED_ADAPTER_READ
        if OVERLAP_BASE_SUM > 0:
            overlap_stat['error_rate'] = float(OVERLAP_BASE_ERR) / float(
                OVERLAP_BASE_SUM)
        else:
            overlap_stat['error_rate'] = 0.0
        overlap_stat['error_matrix'] = OVERLAP_ERR_MATRIX
        overlap_stat['edit_distance_histogram'] = distance_histgram[0:10]
        stat["afterqc_overlap"] = overlap_stat

        reporter.addFigure(
            'Sequence error distribution',
            self.r1qc_prefilter.errorPlotly(OVERLAP_ERR_MATRIX,
                                            'error_matrix'), 'error_matrix',
            "")
        reporter.addFigure(
            'Overlap length distribution',
            self.r1qc_prefilter.overlapPlotly(overlap_histgram, readLen,
                                              TOTAL_READS, 'overlap_stat'),
            'overlap_stat', "")

    report_base = os.path.join(qc_dir, os.path.basename(opts.read1_file))
    stat_json = json.dumps(stat,
                           sort_keys=True,
                           indent=4,
                           separators=(',', ': '))
    # the context manager closes the file even if the write fails
    with open(report_base + ".json", "w") as stat_file:
        stat_file.write(stat_json)

    self.addFiguresToReport(reporter)
    reporter.setStat(stat)
    reporter.setVersion(opts.version)
    reporter.output(report_base + ".html")
def VOCprlist(gtImages, detlist, show=False, usetr=True, usedf=False, ovr=0.5):
    """
        calculate the precision recall curve

        For every ground-truth image, each detection whose first field
        equals the image's base name (path and extension stripped) is
        compared against the image's remaining ground-truth boxes.  The
        first box with ``overlap(detection, box) >= ovr`` is consumed and
        the detection's score is appended to ``tp``; a detection matching
        no box has its score appended to ``fp``.

        gtImages: object providing getTotal(), getImageName(idx),
                  getBBox(idx, usetr=, usedf=) and, when show is set,
                  getImage(idx)
        detlist:  sequence of detections indexed as
                  [image key, score, x1, y1, x2, y2]; fields 2..5 are
                  reordered to (3, 2, 5, 4) before the overlap test
        show:     draw ground truth (blue), matched (green) and unmatched
                  (red) boxes with pylab
        ovr:      minimum overlap for a detection to count as a match

        returns (tp, fp, tot): scores of matched detections, scores of
        unmatched detections and the total number of ground-truth boxes
    """
    tp = []
    fp = []
    tot = 0
    for idx in range(gtImages.getTotal()):
        image_name = gtImages.getImageName(idx)
        print(image_name)
        if show:
            img = gtImages.getImage(idx)
            pylab.figure(1)
            pylab.clf()
            pylab.imshow(img)
        # work on a copy: matched boxes are deleted below and the list
        # handed out by gtImages must not be altered
        rect = list(gtImages.getBBox(idx, usetr=usetr, usedf=usedf))
        print(rect)
        if show:
            for r in rect:
                pylab.figure(1)
                pylab.ioff()
                box(r[0], r[1], r[2], r[3], 'b', lw=1.5)
        tot = tot + len(rect)

        # the key detections are matched on; invariant for this image, so
        # it is computed once instead of once per detection
        image_key = image_name.split("/")[-1].split(".")[0]
        for data in detlist:
            if data[0] != image_key:
                continue
            rb = [
                float(data[3]),
                float(data[2]),
                float(data[5]),
                float(data[4])
            ]
            if show:
                pylab.ioff()
                pylab.text(rb[1], rb[0], data[1])

            # first remaining ground-truth box that overlaps enough
            matched = None
            for pos, r in enumerate(rect):
                if overlap(rb, r) >= ovr:
                    matched = pos
                    break

            if matched is not None:
                if show:
                    pylab.ioff()
                    box(rb[0], rb[1], rb[2], rb[3], 'g', lw=1.5)
                # each ground-truth box can be matched only once
                del rect[matched]
                tp.append(float(data[1]))
            else:
                if show:
                    pylab.ioff()
                    box(rb[0], rb[1], rb[2], rb[3], 'r', lw=1)
                fp.append(float(data[1]))
        if show:
            pylab.figure(1)
            pylab.show()
            pylab.draw()
    return tp, fp, tot
# except: # try: # img=util.myimread(imgpath+"buffy_s5e4/"+l["idim"]) # except: # try: # img=util.myimread(imgpath+"buffy_s5e5/"+l["idim"]) # except: # try: # img=util.myimread(imgpath+"buffy_s5e6/"+l["idim"]) # except: # pass #gooddet=-1 ovr=[] for idb,b in enumerate(gt[l["idim"]]):#for each bb gt ovr.append(util.overlap(b,l["bbox"])) if len(ovr)>0: #print "Best ovr",max(ovr) if max(ovr)>=0.5: detectCRF.visualize2([l],cfg.N,img,text="rank:%d ovr:%.3f scl:%d"%(idl,max(ovr),l["hog"]),bb=gt[l["idim"]][numpy.array(ovr).argmax()],color="w",line=line) else: detectCRF.visualize2([l],cfg.N,img,text="rank:%d ovr:%.3f scl:%d"%(idl,max(ovr),l["hog"]),bb=gt[l["idim"]][numpy.array(ovr).argmax()],color="r",line=line) else: detectCRF.visualize2([l],cfg.N,img,text="rank:%d"%(idl),color="r",line=line) #pl.figure(100) #pl.clf() #pl.imshow(img) raw_input()
def run(self): if self.options.debubble: self.loadBubbleCircles() #read1_file is required read1_file = fastq.Reader(self.options.read1_file) #create a QC folder to contains QC results qc_base_folder = os.path.join(os.path.dirname(self.options.read1_file), "QC") if not os.path.exists(qc_base_folder): os.makedirs(qc_base_folder) #QC result of this file/pair qc_dir = os.path.join(qc_base_folder, os.path.basename(self.options.read1_file)) if not os.path.exists(qc_dir): os.makedirs(qc_dir) #no front trim if sequence is barcoded if self.options.barcode: self.options.trim_front = 0 reporter = QCReporter() r1qc_prefilter = QualityControl(self.options.qc_sample, self.options.qc_kmer) r2qc_prefilter = QualityControl(self.options.qc_sample, self.options.qc_kmer) r1qc_prefilter.statFile(self.options.read1_file) r1qc_prefilter.plot(qc_dir, "R1-prefilter") if self.options.read2_file != None: r2qc_prefilter.statFile(self.options.read2_file) r2qc_prefilter.plot(qc_dir, "R2-prefilter") r1qc_postfilter = QualityControl(self.options.qc_sample, self.options.qc_kmer) r2qc_postfilter = QualityControl(self.options.qc_sample, self.options.qc_kmer) readLen = r1qc_prefilter.readLen overlap_histgram = [0 for x in xrange(readLen+1)] distance_histgram = [0 for x in xrange(readLen+1)] #auto detect trim front and trim tail if self.options.trim_front == -1 or self.options.trim_tail == -1: #auto trim for read1 trimFront, trimTail = r1qc_prefilter.autoTrim() if self.options.trim_front == -1: self.options.trim_front = trimFront if self.options.trim_tail == -1: self.options.trim_tail = trimTail #auto trim for read2 if self.options.read2_file != None: # check if we should keep same trimming for read1/read2 to keep their length identical # this option is on by default because lots of dedup algorithms require this feature if self.options.trim_pair_same: self.options.trim_front2 = self.options.trim_front self.options.trim_tail2 = self.options.trim_tail else: trimFront2, trimTail2 = 
r2qc_prefilter.autoTrim() if self.options.trim_front2 == -1: self.options.trim_front2 = trimFront2 if self.options.trim_tail2 == -1: self.options.trim_tail2 = trimTail2 print(self.options.read1_file + " options:") print(self.options) #if good output folder not specified, set it as the same folder of read1 file good_dir = self.options.good_output_folder if good_dir == None: good_dir = os.path.dirname(self.options.read1_file) #if bad output folder not specified, set it as the same folder of read1 file bad_dir = self.options.bad_output_folder if bad_dir == None: bad_dir = os.path.dirname(self.options.read1_file) #if overlap output folder not specified, set it as the same folder of read1 file overlap_dir = self.options.overlap_output_folder if overlap_dir == None: overlap_dir = os.path.dirname(self.options.read1_file) if not os.path.exists(good_dir): os.makedirs(good_dir) if not os.path.exists(bad_dir): os.makedirs(bad_dir) if self.options.store_overlap and self.options.read2_file != None and (not os.path.exists(overlap_dir)): os.makedirs(overlap_dir) good_read1_file = None bad_read1_file = None overlap_read1_file = None if not self.options.qc_only: good_read1_file = fastq.Writer(os.path.join(good_dir, getMainName(self.options.read1_file)+".good.fq")) bad_read1_file = fastq.Writer(os.path.join(bad_dir, getMainName(self.options.read1_file)+".bad.fq")) overlap_read1_file = None if self.options.store_overlap: overlap_read1_file = fastq.Writer(os.path.join(overlap_dir, getMainName(self.options.read1_file)+".overlap.fq")) #other files are optional read2_file = None good_read2_file = None bad_read2_file = None overlap_read2_file = None index1_file = None good_index1_file = None bad_index1_file = None overlap_index1_file = None index2_file = None good_index2_file = None bad_index2_file = None overlap_index2_file = None #if other files are specified, then read them if self.options.read2_file != None: read2_file = fastq.Reader(self.options.read2_file) if not 
self.options.qc_only: good_read2_file = fastq.Writer(os.path.join(good_dir, getMainName(self.options.read2_file)+".good.fq")) bad_read2_file = fastq.Writer(os.path.join(bad_dir, getMainName(self.options.read2_file)+".bad.fq")) if self.options.store_overlap and self.options.read2_file != None: overlap_read2_file = fastq.Writer(os.path.join(overlap_dir, getMainName(self.options.read2_file)+".overlap.fq")) if self.options.index1_file != None: index1_file = fastq.Reader(self.options.index1_file) if not self.options.qc_only: good_index1_file = fastq.Writer(os.path.join(good_dir, getMainName(self.options.index1_file)+".good.fq")) bad_index1_file = fastq.Writer(os.path.join(bad_dir, getMainName(self.options.index1_file)+".bad.fq")) if self.options.store_overlap and self.options.read2_file != None: overlap_index1_file = fastq.Writer(os.path.join(overlap_dir, getMainName(self.options.index1_file)+".overlap.fq")) if self.options.index2_file != None: index2_file = fastq.Reader(self.options.index2_file) if not self.options.qc_only: good_index2_file = fastq.Writer(os.path.join(good_dir, getMainName(self.options.index2_file)+".good.fq")) bad_index2_file = fastq.Writer(os.path.join(bad_dir, getMainName(self.options.index2_file)+".bad.fq")) if self.options.store_overlap and self.options.read2_file != None: overlap_index2_file = fastq.Writer(os.path.join(overlap_dir, getMainName(self.options.index2_file)+".overlap.fq")) r1 = None r2 = None i1 = None i2 = None # stat numbers TOTAL = 0 GOOD = 0 BAD = 0 BADBCD1 = 0 BADBCD2 = 0 BADTRIM1 = 0 BADTRIM2 = 0 BADBBL = 0 BADLEN = 0 BADPOL = 0 BADLQC = 0 BADNCT = 0 BADOL = 0 BADINDEL = 0 BADMISMATCH = 0 BASE_CORRECTED = 0 OVERLAPPED = 0 OVERLAP_LEN_SUM = 0 while True: r1 = read1_file.nextRead() if r1==None: break if read2_file != None: r2 = read2_file.nextRead() if r2==None: break if index1_file != None: i1 = index1_file.nextRead() if i1==None: break if index2_file != None: i2 = index2_file.nextRead() if i2==None: break TOTAL += 1 #barcode 
processing if self.options.barcode: barcodeLen1 = barcodeprocesser.detectBarcode(r1[1], self.options.barcode_length, self.options.barcode_verify) if barcodeLen1 == 0: self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADBCD1") BADBCD1 += 1 continue else: if r2 == None: barcodeprocesser.moveBarcodeToName(r1, self.options.barcode_length, self.options.barcode_verify) else: barcodeLen2 = barcodeprocesser.detectBarcode(r2[1], self.options.barcode_length, self.options.barcode_verify) if barcodeLen2 == 0: self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADBCD2") BADBCD2 += 1 continue else: barcodeprocesser.moveAndTrimPair(r1, r2, barcodeLen1, barcodeLen2, self.options.barcode_verify) #trim if self.options.trim_front > 0 or self.options.trim_tail > 0: r1 = trim(r1, self.options.trim_front, self.options.trim_tail) if len(r1[1]) < 5: self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADTRIM1") BADTRIM1 += 1 continue if r2 != None: r2 = trim(r2, self.options.trim_front2, self.options.trim_tail2) if len(r2[1]) < 5: self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADTRIM2") BADTRIM2 += 1 continue #filter debubble if self.options.debubble: if self.isInBubble(r1[0]): self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADBBL") BADBBL += 1 continue #filter sequence length if len(r1[1])<self.options.seq_len_req: self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADLEN") BADLEN += 1 continue #check polyX if self.options.poly_size_limit > 0: poly1 = hasPolyX(r1[1], self.options.poly_size_limit, self.options.allow_mismatch_in_poly) poly2 = None if r2!=None: poly2 = hasPolyX(r2[1], self.options.poly_size_limit, self.options.allow_mismatch_in_poly) if poly1!=None or poly2!=None: 
self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADPOL") BADPOL += 1 continue #check low quality count if self.options.unqualified_base_limit > 0: lowQual1 = lowQualityNum(r1, self.options.qualified_quality_phred) lowQual2 = 0 if r2!=None: lowQual2 = lowQualityNum(r2, self.options.qualified_quality_phred) if lowQual1 > self.options.unqualified_base_limit or lowQual1 > self.options.unqualified_base_limit: self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADLQC") BADLQC += 1 continue #check N number if self.options.n_base_limit > 0: nNum1 = nNumber(r1) nNum2 = 0 if r2!=None: nNum2 = nNumber(r2) if nNum1 > self.options.n_base_limit or nNum2 > self.options.n_base_limit: self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADNCT") BADNCT += 1 continue #check overlap and do error correction if r2!=None: (offset, overlap_len, distance) = util.overlap(r1[1], r2[1]) overlap_histgram[overlap_len] += 1 # deal with the case insert DNA is shorter than read length and cause offset is negative if offset <0 and overlap_len > 30: # shift the junk bases r1[1] = r1[1][0:overlap_len] r1[3] = r1[3][0:overlap_len] r2[1] = r2[1][-offset:-offset+overlap_len] r2[3] = r2[3][-offset:-offset+overlap_len] # then calc overlap again (offset, overlap_len, distance) = util.overlap(r1[1], r2[1]) if overlap_len>30: OVERLAPPED += 1 distance_histgram[distance] += 1 OVERLAP_LEN_SUM += overlap_len corrected = 0 if distance > 2: self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADOL") BADOL += 1 continue elif distance>0: #try to fix low quality base hamming = util.hammingDistance(r1[1][len(r1[1]) - overlap_len:], util.reverseComplement(r2[1][len(r2[1]) - overlap_len:])) if hamming != distance: self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADINDEL") BADINDEL 
+= 1 continue #print(r1[1][len(r1[1]) - overlap_len:]) #print(util.reverseComplement(r2[1][len(r2[1]) - overlap_len:])) #print(r1[3][len(r1[1]) - overlap_len:]) #print(util.reverse(r2[3][len(r2[1]) - overlap_len:])) for o in xrange(overlap_len): b1 = r1[1][len(r1[1]) - overlap_len + o] b2 = util.complement(r2[1][-o-1]) q1 = r1[3][len(r1[3]) - overlap_len + o] q2 = r2[3][-o-1] if b1 != b2: # print(TOTAL, o, b1, b2, q1, q2) if util.qualNum(q1) >= 27 and util.qualNum(q2) <= 16: r2[1] = util.changeString(r2[1], -o-1, util.complement(b1)) r2[3] = util.changeString(r2[3], -o-1, q1) corrected += 1 elif util.qualNum(q2) >= 27 and util.qualNum(q1) <= 16: r1[1]= util.changeString(r1[1], len(r1[1]) - overlap_len + o, b2) r1[3] = util.changeString(r1[3], len(r1[3]) - overlap_len + o, q2) corrected += 1 if corrected >= distance: break #print(r1[1][len(r1[1]) - overlap_len:]) #print(util.reverseComplement(r2[1][len(r2[1]) - overlap_len:])) #print(r1[3][len(r1[1]) - overlap_len:]) #print(util.reverse(r2[3][len(r2[1]) - overlap_len:])) if corrected == distance: BASE_CORRECTED += 1 else: self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file, bad_index1_file, bad_index2_file, "BADMISMATCH") BADMISMATCH += 1 continue if distance == 0 or distance == corrected: if self.options.store_overlap: self.writeReads(getOverlap(r1, overlap_len), getOverlap(r2, overlap_len), i1, i2, overlap_read1_file, overlap_read2_file, overlap_index1_file, overlap_index2_file, None) #write to good self.writeReads(r1, r2, i1, i2, good_read1_file, good_read2_file, good_index1_file, good_index2_file, None) r1qc_postfilter.statRead(r1) if r2 != None: r2qc_postfilter.statRead(r2) GOOD += 1 if self.options.qc_only and TOTAL >= self.options.qc_sample: break r1qc_postfilter.qc() r1qc_postfilter.plot(qc_dir, "R1-postfilter") if self.options.read2_file != None: r2qc_postfilter.qc() r2qc_postfilter.plot(qc_dir, "R2-postfilter") #close all files if not self.options.qc_only: good_read1_file.flush() 
bad_read1_file.flush() if self.options.read2_file != None: good_read2_file.flush() bad_read2_file.flush() if self.options.index1_file != None: good_index1_file.flush() bad_index1_file.flush() if self.options.index2_file != None: good_index2_file.flush() bad_index2_file.flush() # print stat numbers BAD = TOTAL - GOOD result = {} result['total_reads']=TOTAL result['good_reads']=GOOD result['bad_reads']=BAD result['bad_reads_with_bad_barcode']= BADBCD1 + BADBCD2 result['bad_reads_with_reads_in_bubble']= BADBBL result['bad_reads_with_bad_read_length']= BADLEN + BADTRIM1 + BADTRIM2 result['bad_reads_with_polyX']= BADPOL result['bad_reads_with_low_quality']=BADLQC result['bad_reads_with_too_many_N']= BADNCT result['bad_reads_with_bad_overlap']= BADOL + BADMISMATCH + BADINDEL # plot result bar figure labels = ['good reads', 'has_polyX', 'low_quality', 'too_short', 'too_many_N'] counts = [GOOD, BADPOL, BADLQC, BADLEN + BADTRIM1 + BADTRIM2, BADNCT] colors = ['green', '#FF1111', '#FF3333', '#FF5555', '#FF7777'] if self.options.read2_file != None: labels.append('bad_overlap') counts.append(BADOL + BADMISMATCH + BADINDEL) colors.append('#FF9999') if self.options.debubble: labels.append('in_bubble') counts.append(BADBBL) colors.append('#FFBBBB') if self.options.barcode: labels.append('bad_barcode') counts.append(BADBCD1 + BADBCD2) colors.append('#FFDDDD') fig = plt.figure(1) plt.title("Good reads (green) and bad reads (red) of total " + str(TOTAL)) fig.subplots_adjust(left = 0.14) lefts = xrange(len(counts)) plt.yticks(lefts, labels) plt.ylim(-0.5, len(counts)-0.5) plt.barh(lefts, counts, align='center', height=0.5, alpha=0.8, color=colors) plt.savefig(os.path.join(qc_dir, "filter-stat.png")) plt.close(1) stat={} # stat["options"]=self.options stat["summary"]=result stat["command"]=makeDict(self.options) stat["kmer_content"] = {} stat["kmer_content"]["read1_prefilter"] = r1qc_prefilter.topKmerCount[0:10] stat["kmer_content"]["read1_postfilter"] = 
r1qc_postfilter.topKmerCount[0:10] if self.options.read2_file != None: stat["kmer_content"]["read2_prefilter"] = r2qc_prefilter.topKmerCount[0:10] stat["kmer_content"]["read2_postfilter"] = r2qc_postfilter.topKmerCount[0:10] stat["overlap"]={} stat["overlap"]['overlapped_pairs']=OVERLAPPED if OVERLAPPED > 0: stat["overlap"]['average_overlap_length']=float(OVERLAP_LEN_SUM/OVERLAPPED) else: stat["overlap"]['average_overlap_length']=0.0 stat["overlap"]['bad_edit_distance']=BADOL stat["overlap"]['bad_mismatch_bases']=BADMISMATCH stat["overlap"]['bad_indel']=BADINDEL stat["overlap"]['reads_with_corrected_mismatch_bases']=BASE_CORRECTED stat["overlap"]['overlapped_area_edit_distance_histogram']=distance_histgram[0:10] plotOverlapHistgram(overlap_histgram, readLen, TOTAL, os.path.join(qc_dir, "overlap.png")) stat_file = open(os.path.join(qc_dir, "after.json"), "w") stat_json = json.dumps(stat, sort_keys=True,indent=4, separators=(',', ': ')) stat_file.write(stat_json) stat_file.close() self.addFiguresToReport(reporter) reporter.output(os.path.join(qc_dir, "report.html"))
def viewDet(gtImages, detfile, opt="all", usetr=True, usedf=False, stop=True, t=0.5):
    """Step through the detections stored in ``detfile``, best score first.

    Every non-blank line of ``detfile`` holds an image name followed by five
    numeric fields.  The first numeric field is the confidence (used as the
    sort key, descending); the remaining four are reordered to
    ``(f2, f1, f4, f3)`` before being handed to ``overlap`` and ``box``.

    For each detection the image is displayed and every ground-truth box is
    compared against it:

    * overlap > 0.49 -- ground truth drawn in yellow (lw "2"), detection in
      green, then the viewer pauses;
    * 0 < overlap <= 0.49 -- ground truth drawn in yellow (lw "1");
    * no ground-truth box above 0.49 -- detection drawn in red, then the
      viewer pauses.

    Parameters:
    gtImages -- object providing ``getImageByName2(name)`` and
                ``getBBoxByName(name, usetr=..., usedf=...)``.
    detfile -- path of the detection text file.
    opt -- unused, kept for interface compatibility.
    usetr, usedf -- forwarded unchanged to ``gtImages.getBBoxByName``.
    stop -- if true wait for <Enter> at each pause, otherwise sleep ``t``.
    t -- pause length in seconds when ``stop`` is false.
    """

    def pause():
        # Either block on the keyboard or just slow the slideshow down.
        if stop:
            raw_input()
        else:
            time.sleep(t)

    # Read the whole file up front so the handle is released before the
    # (long, interactive) display loop; blank lines carry no detection.
    with open(detfile, "r") as detf:
        rows = [fields for fields in (line.split() for line in detf) if fields]

    namelst = [fields[0] for fields in rows]
    detlst = numpy.zeros((len(rows), 5))
    for pos, fields in enumerate(rows):
        detlst[pos, :] = fields[1:]

    order = numpy.argsort(-detlst[:, 0])  # highest confidence first
    ovr = 0.49  # overlap above which a detection counts as a match

    pylab.ioff()
    pylab.figure()
    bb = numpy.zeros(4)
    for pos in order:
        pylab.ioff()
        abb = detlst[pos]
        conf = abb[0]
        # The file stores each coordinate pair in the opposite order to the
        # one overlap()/box() are called with here.
        bb[0] = abb[2]
        bb[1] = abb[1]
        bb[2] = abb[4]
        bb[3] = abb[3]
        pylab.clf()
        name = namelst[pos]
        img = gtImages.getImageByName2(name)
        gtbb = gtImages.getBBoxByName(name, usetr=usetr, usedf=usedf)
        found = False
        for gtbox in gtbb:
            pylab.imshow(img)
            pylab.title("%s Confidence: %f" % (name, float(conf)))
            ov = overlap(bb[:], gtbox[:4])  # computed once, reused below
            print(ov)
            if ov > ovr:
                box(gtbox[0], gtbox[1], gtbox[2], gtbox[3], col='y', lw="2")
                box(bb[0], bb[1], bb[2], bb[3], col='g', lw="2")
                pylab.show()
                pylab.draw()
                pause()
                found = True
            elif ov > 0:
                box(gtbox[0], gtbox[1], gtbox[2], gtbox[3], col='y', lw="1")
        if not found:
            pylab.imshow(img)
            box(bb[0], bb[1], bb[2], bb[3], col='r', lw="2")
            pylab.show()
            pylab.draw()
            pause()
def readMotifMatching(combinationList, coordDict, pwmFileNameList, color="black", pwmReferenceList=None):
    """Reads motif predicted binding sites (MPBS) files and creates the
    structures needed by the statistical test.

    Keyword arguments:
    combinationList -- List of the sizes of the cobinding combinations.
    coordDict -- Dictionary (per chromosome) of coordinates where the motif
                 matching was applied. Element 2 of each coordinate is a
                 ':'-separated list of gene names.
    pwmFileNameList -- List of PWM files where each file's base name (without
                       its last extension) is the motif name. Alternatively,
                       a single tab-separated file containing all the MPBSs
                       with the motif name in the fourth column.
    color -- Color of the bed entries. 'green', 'red' and 'black' are
             translated to RGB triplets; any other value is used as given.
             (default 'black')
    pwmReferenceList -- Optional. When pwmFileNameList is a single file, this
                        list is used as the motif list (and its order), so
                        that the same cobinding combinations are created.
                        (default None)

    Returns:
    mpbsDict -- Dictionary (for each PWM) of dictionaries (for each
                chromosome) of MPBSs that overlapped a coordinate.
    statDict -- Dictionary (for each combination) of [found, not found]
                coordinate counts for the Fisher test.
    geneDict -- Dictionary (for each combination) of the unique genes of the
                coordinates that contain the combination.
    """

    # Reading all MPBSs
    pwmList = []
    allMpbsDict = dict()
    if isinstance(pwmFileNameList, list):
        for pwmFileName in pwmFileNameList:
            pwmName = ".".join(pwmFileName.split("/")[-1].split(".")[:-1])
            pwmList.append(pwmName)
            allMpbsDict[pwmName] = bedFunctions.createBedDictFromSingleFile(
                pwmFileName, separator="\t")
    else:
        if pwmReferenceList:
            pwmList = pwmReferenceList
        # 'with' guarantees the handle is closed even if a malformed line
        # raises while being parsed.
        with open(pwmFileNameList, "r") as pwmFile:
            for line in pwmFile:
                ll = line.strip().split("\t")
                if ll[3] not in allMpbsDict:
                    if not pwmReferenceList:
                        pwmList.append(ll[3])
                    allMpbsDict[ll[3]] = dict()
                allMpbsDict[ll[3]].setdefault(ll[0], []).append(
                    [int(ll[1]), int(ll[2]), ll[3], int(ll[4]), ll[5]])

    # Creating chromosome list without chrX, chrY and chrM.
    # NOTE(review): this filtered list is never used below -- the main loop
    # walks every key of coordDict. Confirm whether the loop was meant to
    # iterate chrList instead.
    chrList = constants.getChromList(reference=[coordDict])
    chrList = [e for e in chrList if e not in ["chrX", "chrY", "chrM"]]

    # Evaluating bed additionals
    if color == "green":
        color = "0,130,0"
    elif color == "red":
        color = "130,0,0"
    elif color == "black":
        color = "0,0,0"

    # Create combinations dictionary keys
    combKeys = [",".join(e)
                for c in combinationList
                for e in itertools.combinations(pwmList, c)]

    # Counting statistics
    mpbsDict = dict([(e, dict()) for e in pwmList])
    statDict = dict([(e, [0, 0]) for e in combKeys])  # [evidence, not evidence]
    geneDict = dict([(e, []) for e in combKeys])
    for chrName in coordDict:

        for e in mpbsDict:
            mpbsDict[e][chrName] = []  # Creating chrName keys

        # A factor missing from the input, or without sites on this
        # chromosome, simply has nothing to match.
        chrSites = dict([(e, allMpbsDict.get(e, {}).get(chrName, []))
                         for e in pwmList])
        # Per-factor cursors; they only move forward across the coordinates
        # of this chromosome.
        counter = dict([(e, 0) for e in pwmList])

        # Iterating on coordinates
        for coord in coordDict[chrName]:

            flagMotifs = dict([(e, False) for e in pwmList])  # Motifs found here

            # Searching for MPBSs that overlapped this coordinate
            for factorName in pwmList:
                sites = chrSites[factorName]
                while counter[factorName] < len(sites):
                    currMpbs = sites[counter[factorName]]
                    check = util.overlap(coord, currMpbs)
                    if check == 0:  # Contain overlap
                        flagMotifs[factorName] = True
                        mpbsDict[factorName][chrName].append(
                            currMpbs + [currMpbs[0], currMpbs[1], color])
                    elif check == -1:
                        break  # Motif is after coord
                    counter[factorName] += 1

            # Updating statistic counts and genes
            motifsFoundList = [k for k in pwmList if flagMotifs[k]]
            motifsFoundKeys = [",".join(e)
                               for c in combinationList
                               for e in itertools.combinations(motifsFoundList, c)]
            foundSet = set(motifsFoundKeys)
            for k in motifsFoundKeys:
                statDict[k][0] += 1
                geneDict[k].extend(coord[2].split(":"))
            for k in combKeys:
                if k not in foundSet:
                    statDict[k][1] += 1

    # Remove repetitive genes from geneList
    for k in geneDict:
        geneDict[k] = list(set(geneDict[k]))

    return mpbsDict, statDict, geneDict
def VOCprRecord_wrong(gtImages, detlist, show=False, usetr=True, usedf=False, ovr=0.5):
    """
        calculate the precision recall curve (variant without
        duplicate-detection handling)

        Every detection whose best overlap with a ground-truth box of its
        image is above ``ovr`` is counted as a true positive, even when that
        box was already matched by a higher scoring detection -- the
        per-box "det" flags are created but never consulted.

        gtImages -- indexable collection of dicts with keys "bbox" and
                    "name"; images are keyed by the file name stem. When
                    ``show`` is true it must also provide
                    ``getImageByName2(name)``.
        detlist  -- list of detections; item [0] is the image id, [1] the
                    score and [2:6] the box, compared as
                    (d[3], d[2], d[5], d[4]). Sorted IN PLACE with cmpscore.
        show     -- interactively print precision/recall after each
                    detection; the image is drawn when the detection is a
                    miss or when "s" is typed.
        usetr, usedf -- unused, kept for interface compatibility.
        ovr      -- overlap a detection must exceed to be a true positive.

        Returns (tp, fp, thr, tot): per-detection true/false positive flags
        and scores (in sorted order) and the number of ground-truth boxes.
    """
    dimg = {}
    tot = 0
    for idx in range(len(gtImages)):
        rect = gtImages[idx]["bbox"][:]
        if rect != []:
            key = gtImages[idx]["name"].split("/")[-1].split(".")[0]
            dimg[key] = {"bbox": rect, "det": [False] * len(rect)}
            tot = tot + len(rect)
    tp = numpy.zeros(len(detlist))
    fp = numpy.zeros(len(detlist))
    thr = numpy.zeros(len(detlist))
    detlist.sort(cmpscore)
    for idx, detbb in enumerate(detlist):
        # Reset per detection so an image without ground truth never shows
        # the boxes of a previously processed image.
        rect = []
        maxovr = 0
        rb = (float(detbb[3]), float(detbb[2]), float(detbb[5]),
              float(detbb[4]))
        if detbb[0] in dimg:
            rect = dimg[detbb[0]]["bbox"]
            for r in rect:
                covr = overlap(rb, r)
                if covr >= maxovr:
                    maxovr = covr
        # 'found' drives both the counters and the colour/visibility of the
        # detection in the interactive view.
        found = maxovr > ovr
        if found:
            tp[idx] = 1
        else:
            fp[idx] = 1
        thr[idx] = detbb[1]
        if show:
            prec = numpy.sum(tp) / float(numpy.sum(tp) + numpy.sum(fp))
            rec = numpy.sum(tp) / tot
            print("Scr: %s Prec:%.3f Rec:%.3f" % (detbb[1], prec, rec))
            ss = raw_input()
            if ss == "s" or not found:
                pylab.ioff()
                img = gtImages.getImageByName2(detbb[0])
                pylab.figure(1)
                pylab.clf()
                pylab.imshow(img)
                for r in rect:
                    box(r[0], r[1], r[2], r[3], 'b', lw=1.5)
                if found:
                    box(rb[0], rb[1], rb[2], rb[3], 'g', lw=1.5)
                else:
                    box(rb[0], rb[1], rb[2], rb[3], 'r', lw=1.5)
                pylab.draw()
                pylab.show()
    return tp, fp, thr, tot
def viewDet(gtImages, detfile, opt="all", usetr=True, usedf=False, stop=True, t=0.5):
    """Step through the detections stored in ``detfile``, best score first.

    Every non-blank line of ``detfile`` holds an image name followed by five
    numeric fields.  The first numeric field is the confidence (used as the
    sort key, descending); the remaining four are reordered to
    ``(f2, f1, f4, f3)`` before being handed to ``overlap`` and ``box``.

    For each detection the image is displayed and every ground-truth box is
    compared against it:

    * overlap > 0.49 -- ground truth drawn in yellow (lw "2"), detection in
      green, then the viewer pauses;
    * 0 < overlap <= 0.49 -- ground truth drawn in yellow (lw "1");
    * no ground-truth box above 0.49 -- detection drawn in red, then the
      viewer pauses.

    Parameters:
    gtImages -- object providing ``getImageByName2(name)`` and
                ``getBBoxByName(name, usetr=..., usedf=...)``.
    detfile -- path of the detection text file.
    opt -- unused, kept for interface compatibility.
    usetr, usedf -- forwarded unchanged to ``gtImages.getBBoxByName``.
    stop -- if true wait for <Enter> at each pause, otherwise sleep ``t``.
    t -- pause length in seconds when ``stop`` is false.
    """

    def pause():
        # Either block on the keyboard or just slow the slideshow down.
        if stop:
            raw_input()
        else:
            time.sleep(t)

    # Read the whole file up front so the handle is released before the
    # (long, interactive) display loop; blank lines carry no detection.
    with open(detfile, "r") as detf:
        rows = [fields for fields in (line.split() for line in detf) if fields]

    namelst = [fields[0] for fields in rows]
    detlst = numpy.zeros((len(rows), 5))
    for pos, fields in enumerate(rows):
        detlst[pos, :] = fields[1:]

    order = numpy.argsort(-detlst[:, 0])  # highest confidence first
    ovr = 0.49  # overlap above which a detection counts as a match

    pylab.ioff()
    pylab.figure()
    bb = numpy.zeros(4)
    for pos in order:
        pylab.ioff()
        abb = detlst[pos]
        conf = abb[0]
        # The file stores each coordinate pair in the opposite order to the
        # one overlap()/box() are called with here.
        bb[0] = abb[2]
        bb[1] = abb[1]
        bb[2] = abb[4]
        bb[3] = abb[3]
        pylab.clf()
        name = namelst[pos]
        img = gtImages.getImageByName2(name)
        gtbb = gtImages.getBBoxByName(name, usetr=usetr, usedf=usedf)
        found = False
        for gtbox in gtbb:
            pylab.imshow(img)
            pylab.title("%s Confidence: %f" % (name, float(conf)))
            ov = overlap(bb[:], gtbox[:4])  # computed once, reused below
            print(ov)
            if ov > ovr:
                box(gtbox[0], gtbox[1], gtbox[2], gtbox[3], col='y', lw="2")
                box(bb[0], bb[1], bb[2], bb[3], col='g', lw="2")
                pylab.show()
                pylab.draw()
                pause()
                found = True
            elif ov > 0:
                box(gtbox[0], gtbox[1], gtbox[2], gtbox[3], col='y', lw="1")
        if not found:
            pylab.imshow(img)
            box(bb[0], bb[1], bb[2], bb[3], col='r', lw="2")
            pylab.show()
            pylab.draw()
            pause()
def VOCprlistfast(gtImages, detlist, show=False, usetr=True, usedf=False, ovr=0.5):
    """
        calculate the precision recall curve

        Detections are visited in the order given by cmpscore. A detection
        is a true positive when it overlaps a still-unmatched ground-truth
        box of its image by at least ``ovr``; that box is then consumed so
        it cannot be matched again. Everything else is a false positive.

        gtImages -- object providing getTotal(), getBBox(idx),
                    getImageName(idx) and, when ``show`` is true,
                    getImageByName2(name). Images are keyed by the file
                    name stem.
        detlist  -- list of detections; item [0] is the image id, [1] the
                    score and [2:6] the box, compared as
                    (d[3], d[2], d[5], d[4]). Sorted IN PLACE with cmpscore.
        show     -- draw every detection (green = matched, red = not) with
                    the remaining ground truth in blue and wait for <Enter>.
        usetr, usedf -- unused, kept for interface compatibility.
        ovr      -- minimum overlap for a match.

        Returns (tp, fp, tot): per-detection flags (in sorted order) and the
        number of ground-truth boxes.
    """
    dimg = {}
    tot = 0
    for idx in range(gtImages.getTotal()):
        rect = gtImages.getBBox(idx)
        if rect != []:
            key = gtImages.getImageName(idx).split("/")[-1].split(".")[0]
            # Work on a copy: matched boxes are removed below and that must
            # not alter the lists owned by gtImages.
            dimg[key] = list(rect)
            tot = tot + len(rect)
    tp = numpy.zeros(len(detlist))
    fp = numpy.zeros(len(detlist))
    detlist.sort(cmpscore)
    for idx, detbb in enumerate(detlist):
        found = False
        # Reset per detection so images without ground truth never reuse
        # the boxes of a previously processed image.
        rect = []
        rb = (float(detbb[3]), float(detbb[2]), float(detbb[5]),
              float(detbb[4]))
        if detbb[0] in dimg:
            rect = dimg[detbb[0]]
            for r in rect:
                if overlap(rb, r) >= ovr:
                    rect.remove(r)  # safe: we leave the loop immediately
                    found = True
                    break
        if found:
            tp[idx] = 1
        else:
            fp[idx] = 1
        if show:
            pylab.ioff()
            img = gtImages.getImageByName2(detbb[0])
            pylab.figure(1)
            pylab.clf()
            pylab.imshow(img)
            for r in rect:
                box(r[0], r[1], r[2], r[3], 'b', lw=1.5)
            if found:
                box(rb[0], rb[1], rb[2], rb[3], 'g', lw=1.5)
            else:
                box(rb[0], rb[1], rb[2], rb[3], 'r', lw=1.5)
            pylab.draw()
            pylab.show()
            raw_input()
    return tp, fp, tot
def overlap_clusters2(C):
    """Group clusters that overlap each other and merge their regions.

    Each cluster is a sequence ``(name_a, start_a, end_a, name_b, start_b,
    end_b)``.

    1. Every cluster C[i] seeds a group; a later cluster joins the group
       when, against any current member, it overlaps on the 'a' side or on
       the 'b' side (same name, util.overlap > 0) and the length ratio of
       the two regions on that side lies strictly between 0.3 and 3.
       Groups with a single member are discarded.
    2. Inside a group the 'a' regions and the 'b' regions are merged
       separately: regions with the same name that overlap are replaced by
       their union. Regions are tuples ``(name, start, end, 'id_a'|'id_b')``.
    3. Groups that share at least 80% of the later group's regions are
       considered duplicates; the smaller one is dropped.

    Progress is written to stdout. Returns the list of remaining groups,
    each a list of merged regions.
    """
    total = len(C)
    overlaps = []
    for i in range(total):
        sys.stdout.write("\r%d / %d" % (i, total))
        sys.stdout.flush()
        K = [C[i]]
        for j in range(i + 1, total):
            cj = C[j]
            for ck in K:
                len1 = float(cj[2] - cj[1] + 1) / float(ck[2] - ck[1] + 1)
                len2 = float(cj[5] - cj[4] + 1) / float(ck[5] - ck[4] + 1)
                # If one of the two regions
                #  * overlap, and
                #  * have a reasonable similar size,
                # add it to the overlap region
                side_a = (cj[0] == ck[0]
                          and util.overlap((cj[1], cj[2]), (ck[1], ck[2])) > 0
                          and 0.3 < len1 < 3)
                if side_a or (cj[3] == ck[3]
                              and util.overlap((cj[4], cj[5]),
                                               (ck[4], ck[5])) > 0
                              and 0.3 < len2 < 3):
                    K.append(cj)
                    break
        if len(K) > 1:
            overlaps.append(K)

    # Overlapping regions
    OR = []
    # Within an overlap, find unique regions; remove duplicate regions
    for k in overlaps:
        R1 = [(c[0], c[1], c[2], 'id_a') for c in k]
        R2 = [(c[3], c[4], c[5], 'id_b') for c in k]
        UR = []  # List of unique regions
        for R in (R1, R2):
            # Walk ALL regions: stopping one short would silently lose the
            # last region whenever it was not merged into an earlier one.
            for i in range(len(R)):
                mr = R[i]  # maximal region
                if mr is None:
                    continue  # already absorbed by an earlier region
                for j in range(i + 1, len(R)):
                    rj = R[j]
                    if rj is None:
                        continue
                    # If the two regions overlap, expand the maximal region
                    if (mr[0] == rj[0]
                            and util.overlap((mr[1], mr[2]),
                                             (rj[1], rj[2])) > 0):
                        mr = (mr[0], min(mr[1], rj[1]), max(mr[2], rj[2]),
                              mr[3])
                        R[j] = None
                UR.append(mr)
        OR.append(UR)

    # Remove duplicates among overlaps (check for subsets)
    for i in range(len(OR) - 1):
        for j in range(i + 1, len(OR)):
            if not OR[i] or not OR[j]:
                continue  # one of the two was already discarded
            shared = sum(1 for k in OR[j] if k in OR[i])
            # The fraction is taken relative to the size of group j; the
            # number of groups says nothing about how similar two groups are.
            if shared >= 0.8 * len(OR[j]):
                if len(OR[i]) > len(OR[j]):
                    OR[j] = []
                else:
                    OR[i] = []

    return [x for x in OR if len(x) > 0]
def VOCprRecordthr(gtImages, detlist, show=False, ovr=0.5, pixels=None):
    """
        calculate the precision recall curve

        Follows the PASCAL 2010 rule: a detection whose best overlap with a
        ground-truth box of its image is above ``ovr`` is
          * ignored when element [5] of that box is non-zero,
          * a true positive when the box was not matched before,
          * a false positive (multiple detection) otherwise;
        a detection without such an overlap is a false positive.

        gtImages -- indexable collection of dicts with keys "bbox" and
                    "name"; images are keyed by the file name stem. Element
                    [5] of each box is the flag tested above. When ``show``
                    is true it must also provide getImageByName2(name).
        detlist  -- list of detections; item [0] is the image id, [1] the
                    score and [2:6] the box, compared as
                    (d[3], d[2], d[5], d[4]). Sorted IN PLACE with cmpscore.
        show     -- interactively print precision/recall after each
                    detection; the image is drawn when the detection is not
                    a true positive or when "s" is typed.
        ovr      -- overlap a detection must exceed to match a box.
        pixels   -- when not None, overlapx(det, gt, pixels) is used instead
                    of overlap(det, gt).

        Returns (tp, fp, thr, tot, posd): per-detection flags and scores (in
        sorted order), the number of ground-truth boxes whose element [5] is
        0, and the scores of the true positives.
    """
    dimg = {}
    tot = 0
    posd = []
    for idx in range(len(gtImages)):
        rect = gtImages[idx]["bbox"][:]
        if rect != []:
            key = gtImages[idx]["name"].split("/")[-1].split(".")[0]
            dimg[key] = {"bbox": rect, "det": [False] * len(rect)}
            # Only boxes with flag [5] == 0 count towards recall.
            for recti in rect:
                if recti[5] == 0:
                    tot = tot + 1
    tp = numpy.zeros(len(detlist))
    fp = numpy.zeros(len(detlist))
    thr = numpy.zeros(len(detlist))
    detlist.sort(cmpscore)
    for idx, detbb in enumerate(detlist):
        found = False
        maxovr = 0
        gt = 0
        # Reset per detection so an image without ground truth never shows
        # the boxes of a previously processed image.
        rect = []
        rb = (float(detbb[3]), float(detbb[2]), float(detbb[5]),
              float(detbb[4]))
        if detbb[0] in dimg:
            rect = dimg[detbb[0]]["bbox"]
            for ir, r in enumerate(rect):
                if pixels is None:
                    covr = overlap(rb, r)
                else:
                    covr = overlapx(rb, r, pixels)
                if covr >= maxovr:
                    maxovr = covr
                    gt = ir
        if maxovr > ovr:
            entry = dimg[detbb[0]]
            if entry["bbox"][gt][5] == 0:
                if not entry["det"][gt]:
                    tp[idx] = 1  # true positive
                    entry["det"][gt] = True
                    posd.append(detbb[1])
                    found = True
                else:
                    fp[idx] = 1  # false positive (multiple detection)
        else:
            fp[idx] = 1  # false positive
        thr[idx] = detbb[1]
        if show:
            prec = numpy.sum(tp) / float(numpy.sum(tp) + numpy.sum(fp))
            rec = numpy.sum(tp) / tot
            print("Scr: %s Prec:%.3f Rec:%.3f" % (detbb[1], prec, rec))
            ss = raw_input()
            if ss == "s" or not found:
                pylab.ioff()
                img = gtImages.getImageByName2(detbb[0])
                pylab.figure(1)
                pylab.clf()
                pylab.imshow(img)
                for r in rect:
                    box(r[0], r[1], r[2], r[3], 'b', lw=1.5)
                if found:
                    box(rb[0], rb[1], rb[2], rb[3], 'g', lw=1.5)
                else:
                    box(rb[0], rb[1], rb[2], rb[3], 'r', lw=1.5)
                pylab.draw()
                pylab.show()
    return tp, fp, thr, tot, posd
def VOCprRecord_wrong(gtImages, detlist, show=False, usetr=True, usedf=False, ovr=0.5):
    """
        calculate the precision recall curve (variant without
        duplicate-detection handling)

        Every detection whose best overlap with a ground-truth box of its
        image is above ``ovr`` is counted as a true positive, even when that
        box was already matched by a higher scoring detection -- the
        per-box "det" flags are created but never consulted.

        gtImages -- indexable collection of dicts with keys "bbox" and
                    "name"; images are keyed by the file name stem. When
                    ``show`` is true it must also provide
                    ``getImageByName2(name)``.
        detlist  -- list of detections; item [0] is the image id, [1] the
                    score and [2:6] the box, compared as
                    (d[3], d[2], d[5], d[4]). Sorted IN PLACE with cmpscore.
        show     -- interactively print precision/recall after each
                    detection; the image is drawn when the detection is a
                    miss or when "s" is typed.
        usetr, usedf -- unused, kept for interface compatibility.
        ovr      -- overlap a detection must exceed to be a true positive.

        Returns (tp, fp, thr, tot): per-detection true/false positive flags
        and scores (in sorted order) and the number of ground-truth boxes.
    """
    dimg = {}
    tot = 0
    for idx in range(len(gtImages)):
        rect = gtImages[idx]["bbox"][:]
        if rect != []:
            key = gtImages[idx]["name"].split("/")[-1].split(".")[0]
            dimg[key] = {"bbox": rect, "det": [False] * len(rect)}
            tot = tot + len(rect)
    tp = numpy.zeros(len(detlist))
    fp = numpy.zeros(len(detlist))
    thr = numpy.zeros(len(detlist))
    detlist.sort(cmpscore)
    for idx, detbb in enumerate(detlist):
        # Reset per detection so an image without ground truth never shows
        # the boxes of a previously processed image.
        rect = []
        maxovr = 0
        rb = (float(detbb[3]), float(detbb[2]), float(detbb[5]),
              float(detbb[4]))
        if detbb[0] in dimg:
            rect = dimg[detbb[0]]["bbox"]
            for r in rect:
                covr = overlap(rb, r)
                if covr >= maxovr:
                    maxovr = covr
        # 'found' drives both the counters and the colour/visibility of the
        # detection in the interactive view.
        found = maxovr > ovr
        if found:
            tp[idx] = 1
        else:
            fp[idx] = 1
        thr[idx] = detbb[1]
        if show:
            prec = numpy.sum(tp) / float(numpy.sum(tp) + numpy.sum(fp))
            rec = numpy.sum(tp) / tot
            print("Scr: %s Prec:%.3f Rec:%.3f" % (detbb[1], prec, rec))
            ss = raw_input()
            if ss == "s" or not found:
                pylab.ioff()
                img = gtImages.getImageByName2(detbb[0])
                pylab.figure(1)
                pylab.clf()
                pylab.imshow(img)
                for r in rect:
                    box(r[0], r[1], r[2], r[3], 'b', lw=1.5)
                if found:
                    box(rb[0], rb[1], rb[2], rb[3], 'g', lw=1.5)
                else:
                    box(rb[0], rb[1], rb[2], rb[3], 'r', lw=1.5)
                pylab.draw()
                pylab.show()
    return tp, fp, thr, tot
def viewSortDet(gtImages, detlist, numim=numpy.inf, opt="all", usetr=True, usedf=False, ovr=0.5, show=True):
    """Show the detections in score order together with running
    precision/recall.

    A detection is a true positive when it overlaps a still-unmatched
    ground-truth box of its image by at least ``ovr``; that box is then
    consumed so it cannot be matched again.

    gtImages -- object providing getTotal(), getBBox(idx),
                getImageName(idx) and getImageByName2(name). Images are
                keyed by the file name stem.
    detlist  -- list of detections; item [0] is the image id, [1] the score
                and [2:6] the box, compared as (d[3], d[2], d[5], d[4]).
                Sorted IN PLACE with cmpscore.
    numim    -- only the first ``numim`` images contribute ground truth.
    opt, usetr, usedf -- unused, kept for interface compatibility.
    ovr      -- minimum overlap for a match.
    show     -- draw every detection (green = matched, red = not) with the
                remaining ground truth in blue, print the running
                precision/recall and wait for <Enter>. The body always read
                this name although it was never defined; it is now an
                explicit option defaulting to the viewer behaviour.

    Returns (tp, fp, thr, tot): per-detection flags and scores (in sorted
    order) and the number of ground-truth boxes considered.
    """
    dimg = {}
    tot = 0
    # int() keeps range() happy when numim is passed as a float.
    for idx in range(int(min(gtImages.getTotal(), numim))):
        rect = gtImages.getBBox(idx)
        if rect != []:
            key = gtImages.getImageName(idx).split("/")[-1].split(".")[0]
            # Work on a copy: matched boxes are removed below and that must
            # not alter the lists owned by gtImages.
            dimg[key] = list(rect)
            tot = tot + len(rect)
    tp = numpy.zeros(len(detlist))
    fp = numpy.zeros(len(detlist))
    thr = numpy.zeros(len(detlist))
    detlist.sort(cmpscore)
    for idx, detbb in enumerate(detlist):
        found = False
        # Reset per detection so images without ground truth never reuse
        # the boxes of a previously processed image.
        rect = []
        rb = (float(detbb[3]), float(detbb[2]), float(detbb[5]),
              float(detbb[4]))
        if detbb[0] in dimg:
            rect = dimg[detbb[0]]
            for r in rect:
                if overlap(rb, r) >= ovr:
                    rect.remove(r)  # safe: we leave the loop immediately
                    found = True
                    break
        if found:
            tp[idx] = 1
        else:
            fp[idx] = 1
        thr[idx] = detbb[1]
        if show:
            pylab.ioff()
            prec = numpy.sum(tp) / float(numpy.sum(tp) + numpy.sum(fp))
            rec = numpy.sum(tp) / tot
            print("Scr: %s Prec: %s Rec: %s" % (detbb[1], prec, rec))
            img = gtImages.getImageByName2(detbb[0])
            pylab.figure(1)
            pylab.clf()
            pylab.imshow(img)
            for r in rect:
                box(r[0], r[1], r[2], r[3], 'b', lw=1.5)
            if found:
                box(rb[0], rb[1], rb[2], rb[3], 'g', lw=1.5)
            else:
                box(rb[0], rb[1], rb[2], rb[3], 'r', lw=1.5)
            pylab.draw()
            pylab.show()
            raw_input()
    return tp, fp, thr, tot
def VOCanalysis(gtImages, detlist, show=False, usetr=True, usedf=False, ovr=0.5):
    """
        split the detections into true positives, duplicate detections,
        unmatched detections and missed ground-truth boxes

        gtImages -- indexable collection of dicts with keys "bbox" and
                    "name"; images are keyed by the file name stem.
        detlist  -- list of detections; item [0] is the image id and [2:6]
                    the box, compared as (d[3], d[2], d[5], d[4]).
                    Sorted IN PLACE with cmpscore.
        show, usetr, usedf -- unused.
        ovr      -- overlap a detection must exceed to match a box.

        Returns (tplist, fplist, fp2list, fnlist):
          tplist  -- detections matching a not yet matched box,
          fplist  -- detections matching an already matched box,
          fp2list -- detections without a sufficient overlap,
          fnlist  -- [name, 0, box[0:4]] for every box never matched.
        The totals of matched / unmatched boxes are printed.
    """

    def stem(path):
        # Image key: file name without directory and extension(s).
        return path.split("/")[-1].split(".")[0]

    # Ground truth per image, with one "already detected" flag per box.
    gt_by_image = {}
    tot = 0
    for pos in range(len(gtImages)):
        boxes = gtImages[pos]["bbox"][:]
        if boxes != []:
            gt_by_image[stem(gtImages[pos]["name"])] = {
                "bbox": boxes,
                "det": [False] * len(boxes),
            }
            tot += len(boxes)

    # Per-detection bookkeeping; kept local, not part of the result.
    n_det = len(detlist)
    tp = numpy.zeros(n_det)
    fp = numpy.zeros(n_det)
    thr = numpy.zeros(n_det)

    tplist, fplist, fp2list, fnlist = [], [], [], []
    detlist.sort(cmpscore)
    for pos, det in enumerate(detlist):
        best_ovr = 0
        best_box = 0
        entry = gt_by_image.get(det[0])
        if entry is not None:
            det_box = (float(det[3]), float(det[2]), float(det[5]),
                       float(det[4]))
            for box_idx, gt_box in enumerate(entry["bbox"]):
                current = overlap(det_box, gt_box)
                # ">=" on purpose: among equal overlaps the last box wins.
                if current >= best_ovr:
                    best_ovr = current
                    best_box = box_idx
        if not best_ovr > ovr:
            fp[pos] = 1
            fp2list.append(det)
            continue
        flags = gt_by_image[det[0]]["det"]
        if flags[best_box]:
            fp[pos] = 1
            fplist.append(det)
        else:
            tp[pos] = 1
            flags[best_box] = True
            tplist.append(det)

    # Second pass over the ground truth: whatever is still unflagged was
    # missed by every detection.
    totalDetected = 0
    totalnoDetected = 0
    for pos in range(len(gtImages)):
        if gtImages[pos]["bbox"][:] == []:
            continue
        name = stem(gtImages[pos]["name"])
        entry = gt_by_image[name]
        for box_idx, detected in enumerate(entry["det"]):
            if detected:
                totalDetected += 1
            else:
                fnlist.append([name, 0, entry["bbox"][box_idx][0:4]])
                totalnoDetected += 1
    print("total Detected %d, total no Detected %d" % (totalDetected, totalnoDetected))
    return tplist, fplist, fp2list, fnlist
def run(self):
    """
    Run the filtering and QC pipeline for the configured read files.

    Steps, in order:
      1. collect pre-filter QC statistics and plots for read1 (and read2);
      2. resolve automatic trimming values (options set to -1);
      3. open the good / bad / overlap writers unless ``qc_only`` is set;
      4. stream the reads, applying the barcode, trim, bubble, length,
         polyX, low-quality, N-count and pair-overlap filters; rejected
         reads go to the "bad" writers together with a reason tag;
      5. collect post-filter QC statistics, then write the pie chart,
         ``after.json`` and ``report.html`` into the QC folder.

    Everything is read from ``self.options``; ``trim_front``,
    ``trim_tail``, ``trim_front2`` and ``trim_tail2`` may be overwritten
    on it. Returns nothing.
    """
    if self.options.debubble:
        self.loadBubbleCircles()

    # read1_file is required
    read1_file = fastq.Reader(self.options.read1_file)

    # create a QC folder to contain the QC results
    qc_base_folder = os.path.join(os.path.dirname(self.options.read1_file), "QC")
    if not os.path.exists(qc_base_folder):
        os.makedirs(qc_base_folder)
    # QC result of this file/pair
    qc_dir = os.path.join(qc_base_folder, os.path.basename(self.options.read1_file))
    if not os.path.exists(qc_dir):
        os.makedirs(qc_dir)

    # no front trim if sequence is barcoded
    if self.options.barcode:
        self.options.trim_front = 0

    reporter = QCReporter()

    r1qc_prefilter = QualityControl(self.options.qc_sample, self.options.qc_kmer)
    r2qc_prefilter = QualityControl(self.options.qc_sample, self.options.qc_kmer)
    r1qc_prefilter.statFile(self.options.read1_file)
    r1qc_prefilter.plot(qc_dir, "R1-prefilter")
    if self.options.read2_file is not None:
        r2qc_prefilter.statFile(self.options.read2_file)
        r2qc_prefilter.plot(qc_dir, "R2-prefilter")

    r1qc_postfilter = QualityControl(self.options.qc_sample, self.options.qc_kmer)
    r2qc_postfilter = QualityControl(self.options.qc_sample, self.options.qc_kmer)

    readLen = r1qc_prefilter.readLen
    overlap_histgram = [0] * (readLen + 1)
    distance_histgram = [0] * (readLen + 1)

    # auto detect trim front and trim tail
    if self.options.trim_front == -1 or self.options.trim_tail == -1:
        # auto trim for read1
        trimFront, trimTail = r1qc_prefilter.autoTrim()
        if self.options.trim_front == -1:
            self.options.trim_front = trimFront
        if self.options.trim_tail == -1:
            self.options.trim_tail = trimTail

        # auto trim for read2
        if self.options.read2_file is not None:
            # check if we should keep same trimming for read1/read2 to keep
            # their length identical; this option is on by default because
            # lots of dedup algorithms require this feature
            if self.options.trim_pair_same:
                self.options.trim_front2 = self.options.trim_front
                self.options.trim_tail2 = self.options.trim_tail
            else:
                trimFront2, trimTail2 = r2qc_prefilter.autoTrim()
                if self.options.trim_front2 == -1:
                    self.options.trim_front2 = trimFront2
                if self.options.trim_tail2 == -1:
                    self.options.trim_tail2 = trimTail2

    print(self.options.read1_file + " options:")
    print(self.options)

    # any output folder that is not specified defaults to read1's folder
    good_dir = self.options.good_output_folder
    if good_dir is None:
        good_dir = os.path.dirname(self.options.read1_file)

    bad_dir = self.options.bad_output_folder
    if bad_dir is None:
        bad_dir = os.path.dirname(self.options.read1_file)

    overlap_dir = self.options.overlap_output_folder
    if overlap_dir is None:
        overlap_dir = os.path.dirname(self.options.read1_file)

    if not os.path.exists(good_dir):
        os.makedirs(good_dir)
    if not os.path.exists(bad_dir):
        os.makedirs(bad_dir)
    if (self.options.store_overlap and self.options.read2_file is not None
            and not os.path.exists(overlap_dir)):
        os.makedirs(overlap_dir)

    good_read1_file = None
    bad_read1_file = None
    overlap_read1_file = None
    if not self.options.qc_only:
        good_read1_file = fastq.Writer(
            os.path.join(good_dir, getMainName(self.options.read1_file) + ".good.fq"))
        bad_read1_file = fastq.Writer(
            os.path.join(bad_dir, getMainName(self.options.read1_file) + ".bad.fq"))
        if self.options.store_overlap:
            overlap_read1_file = fastq.Writer(
                os.path.join(overlap_dir, getMainName(self.options.read1_file) + ".overlap.fq"))

    # other files are optional
    read2_file = None
    good_read2_file = None
    bad_read2_file = None
    overlap_read2_file = None

    index1_file = None
    good_index1_file = None
    bad_index1_file = None
    overlap_index1_file = None

    index2_file = None
    good_index2_file = None
    bad_index2_file = None
    overlap_index2_file = None

    # if other files are specified, then read them
    if self.options.read2_file is not None:
        read2_file = fastq.Reader(self.options.read2_file)
        if not self.options.qc_only:
            good_read2_file = fastq.Writer(
                os.path.join(good_dir, getMainName(self.options.read2_file) + ".good.fq"))
            bad_read2_file = fastq.Writer(
                os.path.join(bad_dir, getMainName(self.options.read2_file) + ".bad.fq"))
            if self.options.store_overlap and self.options.read2_file is not None:
                overlap_read2_file = fastq.Writer(
                    os.path.join(overlap_dir, getMainName(self.options.read2_file) + ".overlap.fq"))
    if self.options.index1_file is not None:
        index1_file = fastq.Reader(self.options.index1_file)
        if not self.options.qc_only:
            good_index1_file = fastq.Writer(
                os.path.join(good_dir, getMainName(self.options.index1_file) + ".good.fq"))
            bad_index1_file = fastq.Writer(
                os.path.join(bad_dir, getMainName(self.options.index1_file) + ".bad.fq"))
            if self.options.store_overlap and self.options.read2_file is not None:
                overlap_index1_file = fastq.Writer(
                    os.path.join(overlap_dir, getMainName(self.options.index1_file) + ".overlap.fq"))
    if self.options.index2_file is not None:
        index2_file = fastq.Reader(self.options.index2_file)
        if not self.options.qc_only:
            good_index2_file = fastq.Writer(
                os.path.join(good_dir, getMainName(self.options.index2_file) + ".good.fq"))
            bad_index2_file = fastq.Writer(
                os.path.join(bad_dir, getMainName(self.options.index2_file) + ".bad.fq"))
            if self.options.store_overlap and self.options.read2_file is not None:
                overlap_index2_file = fastq.Writer(
                    os.path.join(overlap_dir, getMainName(self.options.index2_file) + ".overlap.fq"))

    r1 = None
    r2 = None
    i1 = None
    i2 = None

    # stat numbers
    TOTAL = 0
    GOOD = 0
    BAD = 0
    BADBCD1 = 0
    BADBCD2 = 0
    BADTRIM1 = 0
    BADTRIM2 = 0
    BADBBL = 0
    BADLEN = 0
    BADPOL = 0
    BADLQC = 0
    BADNCT = 0
    BADOL = 0
    BADINDEL = 0
    BADMISMATCH = 0
    BASE_CORRECTED = 0
    OVERLAPPED = 0
    OVERLAP_LEN_SUM = 0

    while True:
        # stop as soon as any of the configured inputs runs out of reads
        r1 = read1_file.nextRead()
        if r1 is None:
            break

        if read2_file is not None:
            r2 = read2_file.nextRead()
            if r2 is None:
                break
        if index1_file is not None:
            i1 = index1_file.nextRead()
            if i1 is None:
                break
        if index2_file is not None:
            i2 = index2_file.nextRead()
            if i2 is None:
                break

        TOTAL += 1

        # barcode processing
        if self.options.barcode:
            barcodeLen1 = barcodeprocesser.detectBarcode(
                r1[1], self.options.barcode_length, self.options.barcode_verify)
            if barcodeLen1 == 0:
                self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file,
                                bad_index1_file, bad_index2_file, "BADBCD1")
                BADBCD1 += 1
                continue
            else:
                if r2 is None:
                    barcodeprocesser.moveBarcodeToName(
                        r1, self.options.barcode_length, self.options.barcode_verify)
                else:
                    barcodeLen2 = barcodeprocesser.detectBarcode(
                        r2[1], self.options.barcode_length, self.options.barcode_verify)
                    if barcodeLen2 == 0:
                        self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file,
                                        bad_index1_file, bad_index2_file, "BADBCD2")
                        BADBCD2 += 1
                        continue
                    else:
                        barcodeprocesser.moveAndTrimPair(
                            r1, r2, barcodeLen1, barcodeLen2, self.options.barcode_verify)

        # trim
        if self.options.trim_front > 0 or self.options.trim_tail > 0:
            r1 = trim(r1, self.options.trim_front, self.options.trim_tail)
            if len(r1[1]) < 5:
                self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file,
                                bad_index1_file, bad_index2_file, "BADTRIM1")
                BADTRIM1 += 1
                continue
            if r2 is not None:
                r2 = trim(r2, self.options.trim_front2, self.options.trim_tail2)
                if len(r2[1]) < 5:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file,
                                    bad_index1_file, bad_index2_file, "BADTRIM2")
                    BADTRIM2 += 1
                    continue

        # filter debubble
        if self.options.debubble:
            if self.isInBubble(r1[0]):
                self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file,
                                bad_index1_file, bad_index2_file, "BADBBL")
                BADBBL += 1
                continue

        # filter sequence length
        if len(r1[1]) < self.options.seq_len_req:
            self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file,
                            bad_index1_file, bad_index2_file, "BADLEN")
            BADLEN += 1
            continue

        # check polyX
        if self.options.poly_size_limit > 0:
            poly1 = hasPolyX(r1[1], self.options.poly_size_limit,
                             self.options.allow_mismatch_in_poly)
            poly2 = None
            if r2 is not None:
                poly2 = hasPolyX(r2[1], self.options.poly_size_limit,
                                 self.options.allow_mismatch_in_poly)
            if poly1 is not None or poly2 is not None:
                self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file,
                                bad_index1_file, bad_index2_file, "BADPOL")
                BADPOL += 1
                continue

        # check low quality count
        if self.options.unqualified_base_limit > 0:
            lowQual1 = lowQualityNum(r1, self.options.qualified_quality_phred)
            lowQual2 = 0
            if r2 is not None:
                lowQual2 = lowQualityNum(r2, self.options.qualified_quality_phred)
            # Both mates must pass: the second operand checks read2's count.
            if (lowQual1 > self.options.unqualified_base_limit
                    or lowQual2 > self.options.unqualified_base_limit):
                self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file,
                                bad_index1_file, bad_index2_file, "BADLQC")
                BADLQC += 1
                continue

        # check N number
        if self.options.n_base_limit > 0:
            nNum1 = nNumber(r1)
            nNum2 = 0
            if r2 is not None:
                nNum2 = nNumber(r2)
            if nNum1 > self.options.n_base_limit or nNum2 > self.options.n_base_limit:
                self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file,
                                bad_index1_file, bad_index2_file, "BADNCT")
                BADNCT += 1
                continue

        # check overlap and do error correction
        if r2 is not None:
            (offset, overlap_len, distance) = util.overlap(r1[1], r2[1])
            overlap_histgram[overlap_len] += 1
            # deal with the case insert DNA is shorter than read length and
            # cause offset is negative
            if offset < 0 and overlap_len > 30:
                # shift the junk bases
                r1[1] = r1[1][0:overlap_len]
                r1[3] = r1[3][0:overlap_len]
                r2[1] = r2[1][-offset:-offset + overlap_len]
                r2[3] = r2[3][-offset:-offset + overlap_len]
                # then calc overlap again
                (offset, overlap_len, distance) = util.overlap(r1[1], r2[1])

            if overlap_len > 30:
                OVERLAPPED += 1
                distance_histgram[distance] += 1
                OVERLAP_LEN_SUM += overlap_len
                corrected = 0
                if distance > 2:
                    self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file,
                                    bad_index1_file, bad_index2_file, "BADOL")
                    BADOL += 1
                    continue
                elif distance > 0:
                    # try to fix low quality base
                    hamming = util.hammingDistance(
                        r1[1][len(r1[1]) - overlap_len:],
                        util.reverseComplement(r2[1][len(r2[1]) - overlap_len:]))
                    # a hamming distance that differs from the reported
                    # distance is recorded as an indel
                    if hamming != distance:
                        self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file,
                                        bad_index1_file, bad_index2_file, "BADINDEL")
                        BADINDEL += 1
                        continue
                    for o in range(overlap_len):
                        b1 = r1[1][len(r1[1]) - overlap_len + o]
                        b2 = util.complement(r2[1][-o - 1])
                        q1 = r1[3][len(r1[3]) - overlap_len + o]
                        q2 = r2[3][-o - 1]
                        if b1 != b2:
                            # overwrite the mate whose quality is low (<= 16)
                            # with the base from the mate whose quality is
                            # high (>= 27)
                            if util.qualNum(q1) >= 27 and util.qualNum(q2) <= 16:
                                r2[1] = util.changeString(r2[1], -o - 1, util.complement(b1))
                                r2[3] = util.changeString(r2[3], -o - 1, q1)
                                corrected += 1
                            elif util.qualNum(q2) >= 27 and util.qualNum(q1) <= 16:
                                r1[1] = util.changeString(r1[1], len(r1[1]) - overlap_len + o, b2)
                                r1[3] = util.changeString(r1[3], len(r1[3]) - overlap_len + o, q2)
                                corrected += 1
                            if corrected >= distance:
                                break
                    if corrected == distance:
                        BASE_CORRECTED += 1
                    else:
                        self.writeReads(r1, r2, i1, i2, bad_read1_file, bad_read2_file,
                                        bad_index1_file, bad_index2_file, "BADMISMATCH")
                        BADMISMATCH += 1
                        continue
                if distance == 0 or distance == corrected:
                    if self.options.store_overlap:
                        self.writeReads(getOverlap(r1, overlap_len),
                                        getOverlap(r2, overlap_len), i1, i2,
                                        overlap_read1_file, overlap_read2_file,
                                        overlap_index1_file, overlap_index2_file, None)

        # write to good
        self.writeReads(r1, r2, i1, i2, good_read1_file, good_read2_file,
                        good_index1_file, good_index2_file, None)
        if self.options.qc_sample <= 0 or TOTAL < self.options.qc_sample:
            r1qc_postfilter.statRead(r1)
            if r2 is not None:
                r2qc_postfilter.statRead(r2)

        GOOD += 1
        if self.options.qc_only and TOTAL >= self.options.qc_sample:
            break

    r1qc_postfilter.qc()
    r1qc_postfilter.plot(qc_dir, "R1-postfilter")
    if self.options.read2_file is not None:
        r2qc_postfilter.qc()
        r2qc_postfilter.plot(qc_dir, "R2-postfilter")

    # flush all output files
    if not self.options.qc_only:
        good_read1_file.flush()
        bad_read1_file.flush()
        if self.options.read2_file is not None:
            good_read2_file.flush()
            bad_read2_file.flush()
        if self.options.index1_file is not None:
            good_index1_file.flush()
            bad_index1_file.flush()
        if self.options.index2_file is not None:
            good_index2_file.flush()
            bad_index2_file.flush()
        # overlap writers exist only on request; flush whichever were opened
        for overlap_writer in (overlap_read1_file, overlap_read2_file,
                               overlap_index1_file, overlap_index2_file):
            if overlap_writer is not None:
                overlap_writer.flush()

    # summary numbers
    BAD = TOTAL - GOOD
    result = {}
    result['total_reads'] = TOTAL
    result['good_reads'] = GOOD
    result['bad_reads'] = BAD
    result['bad_reads_with_bad_barcode'] = BADBCD1 + BADBCD2
    result['bad_reads_with_reads_in_bubble'] = BADBBL
    result['bad_reads_with_bad_read_length'] = BADLEN + BADTRIM1 + BADTRIM2
    result['bad_reads_with_polyX'] = BADPOL
    result['bad_reads_with_low_quality'] = BADLQC
    result['bad_reads_with_too_many_N'] = BADNCT
    result['bad_reads_with_bad_overlap'] = BADOL + BADMISMATCH + BADINDEL

    # plot result pie figure
    labels = ['good reads', 'has_polyX', 'low_quality', 'too_short', 'too_many_N']
    counts = [GOOD, BADPOL, BADLQC, BADLEN + BADTRIM1 + BADTRIM2, BADNCT]
    colors = ['#66BB11', '#FF33AF', '#FFD3F2', '#FFA322', '#FF8899']
    if self.options.read2_file is not None:
        labels.append('bad_overlap')
        counts.append(BADOL + BADMISMATCH + BADINDEL)
        colors.append('#FF6600')
    if self.options.debubble:
        labels.append('in_bubble')
        counts.append(BADBBL)
        colors.append('#EEBB00')
    if self.options.barcode:
        labels.append('bad_barcode')
        counts.append(BADBCD1 + BADBCD2)
        colors.append('#CCDD22')

    for i in range(len(counts)):
        # an empty input must not crash the report with a division by zero
        percent = 100.0 * float(counts[i]) / TOTAL if TOTAL > 0 else 0.0
        labels[i] = labels[i] + ": " + str(counts[i]) + "(" + str(percent) + "%)"

    plt.figure(1)
    plt.title("Filtering statistics of sampled " + str(TOTAL) + " reads",
              fontsize=12, color='#666666')
    plt.axis('equal')
    patches, texts = plt.pie(counts, colors=colors, radius=0.7)
    # legend entries ordered by count, largest first
    patches, labels, dummy = zip(*sorted(
        zip(patches, labels, counts), key=lambda x: x[2], reverse=True))
    plt.legend(patches, labels, loc='upper left', fontsize=9)
    plt.savefig(os.path.join(qc_dir, "filter-stat.png"), bbox_inches='tight')
    plt.close(1)

    stat = {}
    stat["summary"] = result
    stat["command"] = makeDict(self.options)
    stat["kmer_content"] = {}
    stat["kmer_content"]["read1_prefilter"] = r1qc_prefilter.topKmerCount[0:10]
    stat["kmer_content"]["read1_postfilter"] = r1qc_postfilter.topKmerCount[0:10]
    if self.options.read2_file is not None:
        stat["kmer_content"]["read2_prefilter"] = r2qc_prefilter.topKmerCount[0:10]
        stat["kmer_content"]["read2_postfilter"] = r2qc_postfilter.topKmerCount[0:10]
        stat["overlap"] = {}
        stat["overlap"]['overlapped_pairs'] = OVERLAPPED
        if OVERLAPPED > 0:
            # convert before dividing so the average is not truncated
            stat["overlap"]['average_overlap_length'] = float(OVERLAP_LEN_SUM) / OVERLAPPED
        else:
            stat["overlap"]['average_overlap_length'] = 0.0
        stat["overlap"]['bad_edit_distance'] = BADOL
        stat["overlap"]['bad_mismatch_bases'] = BADMISMATCH
        stat["overlap"]['bad_indel'] = BADINDEL
        stat["overlap"]['reads_with_corrected_mismatch_bases'] = BASE_CORRECTED
        stat["overlap"]['overlapped_area_edit_distance_histogram'] = distance_histgram[0:10]
        plotOverlapHistgram(overlap_histgram, readLen, TOTAL,
                            os.path.join(qc_dir, "overlap.png"))

    stat_json = json.dumps(stat, sort_keys=True, indent=4, separators=(',', ': '))
    # context manager closes the file even if the write fails
    with open(os.path.join(qc_dir, "after.json"), "w") as stat_file:
        stat_file.write(stat_json)

    self.addFiguresToReport(reporter)
    reporter.output(os.path.join(qc_dir, "report.html"))
def setComplementSequence(self, sequenceString, strand):
    """
    Write the given complementary sequence into the part of this strand's
    sequence that overlaps ``strand``, leaving the rest untouched.

    Sequences are stored as strings 5' to 3', so the direction flag
    ``_isDrawn5to3`` decides where the reversal happens: the incoming
    string is reversed up front when this strand is drawn 5' to 3',
    otherwise this strand's own sequence is reversed while it is edited
    and reversed back afterwards.

    Args:
        sequenceString: sequence to apply, or None to blank out the
            overlapping region with spaces.
        strand: the complementary strand; supplies idxs() and
            insertionLengthBetweenIdxs().

    Returns:
        The updated ``self._sequence``, or None when it contains nothing
        but whitespace.
    """
    ownLow, ownHigh = self._baseIdxLow, self._baseIdxHigh
    compLow, compHigh = strand.idxs()

    # index range shared by the two strands, as reported by util.overlap
    lowIdx, highIdx = util.overlap(ownLow, ownHigh, compLow, compHigh)

    totalLength = self.totalLength()

    # source characters: blanks when clearing, otherwise the given string
    if sequenceString is None:
        useSeq = ' ' * totalLength
    elif self._isDrawn5to3:
        useSeq = sequenceString[::-1]
    else:
        useSeq = sequenceString
    temp = array(ARRAY_TYPE, sixb(useSeq))

    # destination characters: this strand's sequence in left-to-right order
    if self._sequence is None:
        ownChars = ' ' * totalLength
    elif self._isDrawn5to3:
        ownChars = self._sequence
    else:
        ownChars = self._sequence[::-1]
    tempSelf = array(ARRAY_TYPE, sixb(ownChars))

    # insertion lengths shift the plain index arithmetic on both strands
    a = self.insertionLengthBetweenIdxs(ownLow, lowIdx - 1)
    b = self.insertionLengthBetweenIdxs(lowIdx, highIdx)
    c = strand.insertionLengthBetweenIdxs(compLow, lowIdx - 1)

    span = highIdx - lowIdx + 1
    srcStart = lowIdx - compLow + c
    srcEnd = srcStart + b + span
    dstStart = lowIdx - ownLow + a
    dstEnd = dstStart + span + b
    tempSelf[dstStart:dstEnd] = temp[srcStart:srcEnd]

    self._sequence = tostring(tempSelf)

    # undo the temporary reversal applied to our own sequence above
    if not self._isDrawn5to3:
        self._sequence = self._sequence[::-1]

    # an all-blank sequence is stored as None
    if not self._sequence.strip():
        self._sequence = None
    return self._sequence
def VOCprlist(gtImages,detlist,show=False,usetr=True,usedf=False,ovr=0.5):
    """
    Collect the scores of true-positive and false-positive detections,
    image by image, optionally drawing the boxes.

    Args:
        gtImages: ground-truth container; must provide getTotal(),
            getImageName(idx), getImage(idx) and
            getBBox(idx, usetr=..., usedf=...).
        detlist: iterable of detections, each indexed as [0] image key,
            [1] score, [2]..[5] box coordinates. It is scanned in the
            given order; no sorting is done here.
        show: when true, draw ground truth (blue), matched detections
            (green) and unmatched detections (red) with pylab.
        usetr, usedf: forwarded to gtImages.getBBox.
        ovr: minimum ``overlap`` value for a detection to match a box.

    Returns:
        (tp, fp, tot): list of scores of matched detections, list of
        scores of unmatched detections, and the total number of
        ground-truth boxes. Detections whose image key matches no image
        in gtImages appear in neither list.
    """
    tp=[]
    fp=[]
    tot=0
    for idx in range(gtImages.getTotal()):
        imagePath=gtImages.getImageName(idx)
        print(imagePath)
        if show:
            img=gtImages.getImage(idx)
            pylab.figure(1)
            pylab.clf()
            pylab.imshow(img)
        rect=gtImages.getBBox(idx,usetr=usetr,usedf=usedf)
        print(rect)
        if show:
            for r in rect:
                pylab.figure(1)
                pylab.ioff()
                box(r[0],r[1],r[2],r[3],'b',lw=1.5)
        tot=tot+len(rect)
        # file name without directory and extension; computed once per image
        imageKey=imagePath.split("/")[-1].split(".")[0]
        for data in detlist:
            if data[0]!=imageKey:
                continue
            notfound=True
            # the stored detection order is swapped pairwise for overlap()
            rb=[float(data[3]),float(data[2]),float(data[5]),float(data[4])]
            if show:
                pylab.ioff()
                pylab.text(rb[1],rb[0],data[1])
            for ir,r in enumerate(rect):
                if overlap(rb,r)>=ovr:
                    if show:
                        pylab.ioff()
                        box(rb[0],rb[1],rb[2],rb[3],'g',lw=1.5)
                    # a ground-truth box can be matched only once; deleting
                    # during iteration is safe because we break right away
                    del rect[ir]
                    tp.append(float(data[1]))
                    notfound=False
                    break
            if notfound:
                if show:
                    pylab.ioff()
                    box(rb[0],rb[1],rb[2],rb[3],'r',lw=1)
                fp.append(float(data[1]))
        if show:
            pylab.figure(1)
            pylab.show()
            pylab.draw()
    return tp,fp,tot
def setComplementSequence(self, sequenceString, strand):
    """
    Write the given complementary sequence into the part of this strand's
    sequence that overlaps ``strand``, leaving the rest untouched.

    Sequences are stored as strings 5' to 3', so the direction flag
    ``_isDrawn5to3`` decides where the reversal happens: the incoming
    string is reversed up front when this strand is drawn 5' to 3',
    otherwise this strand's own sequence is reversed while it is edited
    and reversed back afterwards.

    Args:
        sequenceString: sequence to apply, or None to blank out the
            overlapping region with spaces.
        strand: the complementary strand; supplies idxs() and
            insertionLengthBetweenIdxs().

    Returns:
        The updated ``self._sequence``, or None when it contains nothing
        but whitespace.
    """
    ownLow, ownHigh = self._baseIdxLow, self._baseIdxHigh
    compLow, compHigh = strand.idxs()

    # index range shared by the two strands, as reported by util.overlap
    lowIdx, highIdx = util.overlap(ownLow, ownHigh, compLow, compHigh)

    totalLength = self.totalLength()

    # source characters: blanks when clearing, otherwise the given string
    if sequenceString is None:
        useSeq = ' ' * totalLength
    elif self._isDrawn5to3:
        useSeq = sequenceString[::-1]
    else:
        useSeq = sequenceString
    temp = array('c', useSeq)

    # destination characters: this strand's sequence in left-to-right order
    if self._sequence is None:
        ownChars = ' ' * totalLength
    elif self._isDrawn5to3:
        ownChars = self._sequence
    else:
        ownChars = self._sequence[::-1]
    tempSelf = array('c', ownChars)

    # insertion lengths shift the plain index arithmetic on both strands
    a = self.insertionLengthBetweenIdxs(ownLow, lowIdx - 1)
    b = self.insertionLengthBetweenIdxs(lowIdx, highIdx)
    c = strand.insertionLengthBetweenIdxs(compLow, lowIdx - 1)

    span = highIdx - lowIdx + 1
    srcStart = lowIdx - compLow + c
    srcEnd = srcStart + b + span
    dstStart = lowIdx - ownLow + a
    dstEnd = dstStart + span + b
    tempSelf[dstStart:dstEnd] = temp[srcStart:srcEnd]

    self._sequence = tempSelf.tostring()

    # undo the temporary reversal applied to our own sequence above
    if not self._isDrawn5to3:
        self._sequence = self._sequence[::-1]

    # an all-blank sequence is stored as None
    if not self._sequence.strip():
        self._sequence = None
    return self._sequence