from collections import Counter, defaultdict
from itertools import product
from multiprocessing import Pool
from typing import Generator

import pandas as pd

# get_contig_sizes, get_bins_from_bed_dict, blacklisted_binning, gc_correct and
# _generate_count_dict are helpers defined elsewhere in this module.


def get_binned_counts(bams, bin_size, regions=None):
    # Upstream padding (bp) applied to explicit region starts, so fragments that
    # overlap the region boundary are still counted:
    fs = 1000
    if regions is None:
        # Default to every contig present in the first bam file:
        regions = [(contig, None, None) for contig in get_contig_sizes(bams[0])]
    else:
        regions = list(regions)  # avoid mutating the caller's list
        for i, region in enumerate(regions):
            if isinstance(region, str):
                regions[i] = (region, None, None)
            else:
                contig, start, end = region
                if isinstance(start, int):
                    start = max(0, start - fs)
                regions[i] = (contig, start, end)

    # One job per (region, bam) combination:
    jobs = [(bam_path, bin_size, *region)
            for region, bam_path in product(regions, bams)]

    cut_counts = defaultdict(Counter)
    with Pool() as workers:
        for i, (cc, contig, bam_path) in enumerate(
                workers.imap(_generate_count_dict, jobs)):
            for k, v in cc.items():
                cut_counts[k] += v
            print(f'{i + 1}/{len(jobs)}', end='\r')
    return pd.DataFrame(cut_counts).T
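# Usage sketch for get_binned_counts. The bam paths below are hypothetical
# placeholders; regions may be plain contig names or (contig, start, end)
# tuples, and explicit integer starts are padded 1 kb upstream.
def _example_get_binned_counts():
    bams = ['cell_lib1.bam', 'cell_lib2.bam']  # hypothetical input files
    counts = get_binned_counts(bams,
                               bin_size=250_000,
                               regions=['chr1', ('chr2', 1_000_000, 2_000_000)])
    counts.to_csv('binned_counts.csv')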
def blacklisted_binning_contigs(contig_length_resource: str,
                                bin_size: int,
                                fragment_size: int,
                                blacklist_path: str = None,
                                contig_whitelist: list = None) -> Generator:
    """Generate (contig, bin_start, bin_end) tuples of size bin_size or smaller,
    or (contig, bin_start, bin_end, fetch_start, fetch_end) tuples when
    fragment_size is supplied. Regions present in the blacklist BED file are
    excluded from the generated bins.

    Args:
        contig_length_resource(str): Path to bam file from which to extract the contig lengths
        bin_size(int): Maximum size of the generated bins (some bins may be smaller)
        fragment_size(int): When supplied, fetch_start and fetch_end are produced,
            equal to bin_start - fragment_size and bin_end + fragment_size, but they
            never overlap blacklisted regions or exceed contig boundaries.
        blacklist_path(str): Path to blacklist BED file
        contig_whitelist(iterable): Set of contigs to restrict the result to.
            All contigs are included when contig_whitelist is not specified.

    Returns:
        bin_tuples(Generator): (contig, bin_start, bin_end), or
            (contig, bin_start, bin_end, fetch_start, fetch_end) when fragment_size is specified
    """
    if blacklist_path is not None:
        blacklist_dict = get_bins_from_bed_dict(blacklist_path)
    else:
        blacklist_dict = {}

    # contig_length_resource is either a bam path or an iterable of (contig, length):
    for contig, length in (get_contig_sizes(contig_length_resource).items()
                           if isinstance(contig_length_resource, str)
                           else contig_length_resource):
        if contig_whitelist is not None and contig not in contig_whitelist:
            continue

        if fragment_size is not None:
            for bin_start, bin_end, fetch_start, fetch_end in blacklisted_binning(
                    start_coord=0,
                    end_coord=length,
                    bin_size=bin_size,
                    blacklist=sorted(blacklist_dict.get(contig, [])),
                    fragment_size=fragment_size):
                yield contig, bin_start, bin_end, fetch_start, fetch_end
        else:
            for bin_start, bin_end in blacklisted_binning(
                    start_coord=0,
                    end_coord=length,
                    bin_size=bin_size,
                    blacklist=sorted(blacklist_dict.get(contig, []))):
                yield contig, bin_start, bin_end
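# Usage sketch for blacklisted_binning_contigs: enumerate 1 Mb bins with 500 bp
# fetch padding, skipping regions listed in a blacklist BED file. 'sample.bam'
# and 'blacklist.bed' are hypothetical paths.
def _example_blacklisted_bins():
    for contig, bin_start, bin_end, fetch_start, fetch_end in \
            blacklisted_binning_contigs('sample.bam',
                                        bin_size=1_000_000,
                                        fragment_size=500,
                                        blacklist_path='blacklist.bed',
                                        contig_whitelist={'chr1', 'chr2'}):
        print(contig, bin_start, bin_end, fetch_start, fetch_end)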
    corrected_cells = list(
        workers.imap(
            gc_correct,
            [(row, gc_vector.values, MAXCP) for cell, row in df.iterrows()]))
    corrected_cells = pd.concat(corrected_cells, axis=1).T

    # Scale every cell so its central copy number statistic equals two:
    if norm_method == 'median':
        corrected_cells = ((corrected_cells.T / corrected_cells.median(1)) * 2).T
    elif norm_method == 'mean':
        corrected_cells = ((corrected_cells.T / corrected_cells.mean(1)) * 2).T
    else:
        raise ValueError('norm_method not understood')
    return corrected_cells


def generate_jobs(alignments_path, bin_size=1_000_000, bins_per_job=10):
    # Yield (contig, start, end) spans covering every contig in the alignment
    # file, each span containing bins_per_job bins of bin_size base pairs. The
    # final span of a contig can extend past the contig end.
    for job_group in (
            ((contig, start, start + bin_size * bins_per_job)
             for start in range(0, length, bin_size * bins_per_job))
            for contig, length in get_contig_sizes(alignments_path).items()):
        yield from job_group


def generate_commands(alignments_path, bin_size=1_000_000, bins_per_job=10,
                      alt_spans=None, min_mq=50, max_fragment_size=1000,
                      head=None, key_tags=None, dedup=True,
                      kwargs=None, skip_contigs=None):
    for i, (contig, start, end) in enumerate(
            generate_jobs(alignments_path=alignments_path,
                          bin_size=bin_size,
                          bins_per_job=bins_per_job)):
        if skip_contigs is not None and contig in skip_contigs:
            continue
        # Worker argument tuple; kwargs carries optional extra parameters:
        yield (alignments_path, bin_size, max_fragment_size,
               contig, start, end,
               min_mq, alt_spans, key_tags, dedup, kwargs)
        if head is not None and i >= (head - 1):
            break
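# Small sketch of what generate_jobs yields: with bin_size=1_000_000 and
# bins_per_job=10 each job spans 10 Mb, so a 25 Mb contig yields the spans
# (0, 10_000_000), (10_000_000, 20_000_000) and (20_000_000, 30_000_000).
# 'sample.bam' is a hypothetical path.
def _example_generate_jobs():
    for contig, start, end in generate_jobs('sample.bam',
                                            bin_size=1_000_000,
                                            bins_per_job=10):
        print(contig, start, end)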
def count_fragments_binned(args):
    # Unpack the worker argument tuple produced by generate_commands:
    (alignments_path, bin_size, max_fragment_size,
     contig, start, end,
     min_mq, alt_spans, key_tags, dedup, kwargs) = args
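# End-to-end sketch: fan the generated jobs out over a worker pool. It assumes
# that count_fragments_binned returns one result per job once fully
# implemented; 'sample.bam' is a hypothetical path.
def _example_count_pipeline():
    jobs = list(generate_commands('sample.bam',
                                  bin_size=1_000_000,
                                  bins_per_job=10,
                                  skip_contigs={'chrM'},
                                  head=100))
    with Pool() as workers:
        for result in workers.imap(count_fragments_binned, jobs):
            ...  # aggregate per-bin results here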