def find_nocov_variants(covlist, chrom='', caller='', min_cov=5):
    ''' identify runs of positions whose coverage is below min_cov

    covlist is indexed by 1-based position; index 0 is a placeholder (the
    callers apparently store -1 there) and is never reported.

    Returns a list of Variant objects of type 'no_cov', one per interval
    produced by intervals(), each carrying 'chrom', 'caller', 'pos',
    'length' and 'mean_cov'. Returns None when every position is below
    min_cov (the entire sequence has no coverage).
    '''
    # scipy.mean was only a deprecated alias of numpy.mean and is missing
    # from recent SciPy releases; numpy.mean returns the same value and type.
    import numpy

    assert min(covlist[1:]) >= 0

    # Skip the placeholder at index 0 explicitly. The previous
    # "collect everything, then remove(0)" form raised ValueError whenever
    # covlist[0] happened not to be below min_cov.
    nocov = [i for i, v in enumerate(covlist) if i > 0 and v < min_cov]

    if len(covlist) - 1 == len(nocov):
        return None  # entire sequence has no coverage

    variants = []
    # intervals() is used as yielding (first, last) pairs with both ends
    # inclusive -- see the length computation below.
    for iv in intervals(nocov):
        first, last = iv[0], iv[1]
        data = {'chrom': chrom, 'caller': caller, 'pos': first, 'type': 'no_cov'}
        data['length'] = last - first + 1
        data['mean_cov'] = numpy.mean(covlist[first:(last + 1)])
        variants.append(Variant.from_dict(data))
    return variants
def find_variants(covlist, seq, chrom, min_cov=5, min_score=30,
                  exclude_edges=False, exclude_overlaps=False):
    ''' identify coverage variants in covlist

    covlist is indexed by 1-based position; index 0 is a placeholder and is
    ignored, so len(covlist) - 1 must equal len(seq).

    Two kinds of variants are reported:
      'no_cov'  - intervals of positions whose coverage is below min_cov
      'cov_dip' - intervals of positions whose score from
                  adjusted_coverage_score() is at least min_score

    exclude_edges drops cov_dip intervals that start at position 1 or end at
    the last position. exclude_overlaps passes the cov_dip intervals through
    remove_overlap() together with the no_cov intervals.

    Returns dict with keys 'mean_cov' and 'pct_cov', plus 'variants' (a list
    of Variant objects) when at least one variant was found. When every
    position is below min_cov only 'mean_cov' and 'pct_cov' are returned.
    '''
    # scipy.mean was only a deprecated alias of numpy.mean and is missing
    # from recent SciPy releases; numpy.mean returns the same value and type.
    import numpy

    assert min(covlist[1:]) >= 0
    assert len(covlist) - 1 == len(seq), "Number of coverage values (%d) is not equal to sequence length (%d)" % (len(covlist) - 1, len(seq))

    last_pos = len(covlist) - 1

    # Skip the placeholder at index 0 explicitly. The previous
    # "collect everything, then remove(0)" form raised ValueError whenever
    # covlist[0] happened not to be below min_cov.
    nocov = [i for i, v in enumerate(covlist) if i > 0 and v < min_cov]

    retval = {}
    retval['mean_cov'] = numpy.mean(covlist[1:])
    retval['pct_cov'] = 1 - (float(len(nocov)) / last_pos)
    if len(nocov) == len(seq):
        # nothing is covered: no point in scoring dips
        return retval

    nocov_intervals = list(intervals(nocov))

    # (an earlier revision scored with local_coverage_score instead)
    covscores, localmeans = adjusted_coverage_score(covlist)
    covdip = [i for i, v in enumerate(covscores) if v >= min_score]
    covdip_intervals = list(intervals(covdip))

    # refine intervals
    if exclude_edges:
        # ignore intervals that touch the beginning or end of the reference
        covdip_intervals = [
            iv for iv in covdip_intervals
            if iv[0] != 1 and iv[1] != last_pos
        ]
    if exclude_overlaps:
        # ignore covdip intervals that overlap with nocov intervals
        covdip_intervals = remove_overlap(covdip_intervals, nocov_intervals)

    # NOTE(review): the original also filtered the per-position covdip list
    # against nocov at this point, but the filtered list was never read
    # again, so that dead (and quadratic) step has been dropped. If
    # low-coverage positions are meant to be cut out of cov_dip intervals,
    # the intervals themselves need rebuilding -- confirm intent.

    variants = []
    for iv in nocov_intervals:
        first, last = iv[0], iv[1]
        data = {'chrom': chrom, 'pos': first, 'type': 'no_cov'}
        data['length'] = last - first + 1
        data['mean_cov'] = numpy.mean(covlist[first:(last + 1)])
        variants.append(Variant.from_dict(data))

    for iv in covdip_intervals:
        first, last = iv[0], iv[1]
        data = {'chrom': chrom, 'pos': first, 'type': 'cov_dip'}
        data['length'] = last - first + 1
        data['mean_cov'] = numpy.mean(covlist[first:(last + 1)])
        intscores = covscores[first:(last + 1)]
        intmeans = localmeans[first:(last + 1)]
        # the interval's quality is its highest per-position score
        data['quality'] = max(intscores)
        data['info'] = {
            'CovScores': ','.join(['%d' % int(round(v)) for v in intscores]),
            'LocalMeans': ','.join(['%d' % int(round(v)) for v in intmeans]),
        }
        # NOTE(review): covlist is 1-based but seq is sliced with the same
        # indices; if seq is 0-based (e.g. a Biopython SeqRecord) this
        # reference string is shifted one base to the right of 'pos' --
        # confirm before changing, downstream may depend on it.
        data['ref'] = str(seq[first:(last + 1)].seq).upper()
        variants.append(Variant.from_dict(data))

    if variants:
        retval['variants'] = variants
    return retval
def find_variants(covlist, seq, chrom, min_cov=5, min_score=30,
                  exclude_edges=False, exclude_overlaps=False):
    ''' identify coverage variants in covlist

    covlist is indexed by 1-based position; index 0 is a placeholder and is
    ignored, so len(covlist) - 1 must equal len(seq).

    Two kinds of variants are reported:
      'no_cov'  - intervals of positions whose coverage is below min_cov
      'cov_dip' - intervals of positions whose score from
                  adjusted_coverage_score() is at least min_score

    exclude_edges drops cov_dip intervals that start at position 1 or end at
    the last position. exclude_overlaps passes the cov_dip intervals through
    remove_overlap() together with the no_cov intervals.

    Returns dict with keys 'mean_cov' and 'pct_cov', plus 'variants' (a list
    of Variant objects) when at least one variant was found. When every
    position is below min_cov only 'mean_cov' and 'pct_cov' are returned.
    '''
    # scipy.mean was only a deprecated alias of numpy.mean and is missing
    # from recent SciPy releases; numpy.mean returns the same value and type.
    import numpy

    assert min(covlist[1:]) >= 0
    assert len(covlist) - 1 == len(seq), "Number of coverage values (%d) is not equal to sequence length (%d)" % (len(covlist) - 1, len(seq))

    last_pos = len(covlist) - 1

    # Skip the placeholder at index 0 explicitly. The previous
    # "collect everything, then remove(0)" form raised ValueError whenever
    # covlist[0] happened not to be below min_cov.
    nocov = [i for i, v in enumerate(covlist) if i > 0 and v < min_cov]

    retval = {}
    retval['mean_cov'] = numpy.mean(covlist[1:])
    retval['pct_cov'] = 1 - (float(len(nocov)) / last_pos)
    if len(nocov) == len(seq):
        # nothing is covered: no point in scoring dips
        return retval

    nocov_intervals = list(intervals(nocov))

    # (an earlier revision scored with local_coverage_score instead)
    covscores, localmeans = adjusted_coverage_score(covlist)
    covdip = [i for i, v in enumerate(covscores) if v >= min_score]
    covdip_intervals = list(intervals(covdip))

    # refine intervals
    if exclude_edges:
        # ignore intervals that touch the beginning or end of the reference
        covdip_intervals = [
            iv for iv in covdip_intervals
            if iv[0] != 1 and iv[1] != last_pos
        ]
    if exclude_overlaps:
        # ignore covdip intervals that overlap with nocov intervals
        covdip_intervals = remove_overlap(covdip_intervals, nocov_intervals)

    # NOTE(review): the original also filtered the per-position covdip list
    # against nocov at this point, but the filtered list was never read
    # again, so that dead (and quadratic) step has been dropped. If
    # low-coverage positions are meant to be cut out of cov_dip intervals,
    # the intervals themselves need rebuilding -- confirm intent.

    variants = []
    for iv in nocov_intervals:
        first, last = iv[0], iv[1]
        data = {'chrom': chrom, 'pos': first, 'type': 'no_cov'}
        data['length'] = last - first + 1
        data['mean_cov'] = numpy.mean(covlist[first:(last + 1)])
        variants.append(Variant.from_dict(data))

    for iv in covdip_intervals:
        first, last = iv[0], iv[1]
        data = {'chrom': chrom, 'pos': first, 'type': 'cov_dip'}
        data['length'] = last - first + 1
        data['mean_cov'] = numpy.mean(covlist[first:(last + 1)])
        intscores = covscores[first:(last + 1)]
        intmeans = localmeans[first:(last + 1)]
        # the interval's quality is its highest per-position score
        data['quality'] = max(intscores)
        data['info'] = {
            'CovScores': ','.join(['%d' % int(round(v)) for v in intscores]),
            'LocalMeans': ','.join(['%d' % int(round(v)) for v in intmeans]),
        }
        # NOTE(review): covlist is 1-based but seq is sliced with the same
        # indices; if seq is 0-based (e.g. a Biopython SeqRecord) this
        # reference string is shifted one base to the right of 'pos' --
        # confirm before changing, downstream may depend on it.
        data['ref'] = str(seq[first:(last + 1)].seq).upper()
        variants.append(Variant.from_dict(data))

    if variants:
        retval['variants'] = variants
    return retval