from collections import Counter, defaultdict
from itertools import product
from multiprocessing import Pool
from typing import Generator

import pandas as pd

# get_contig_sizes, get_bins_from_bed_dict, blacklisted_binning, gc_correct and
# _generate_count_dict are helpers assumed to be defined elsewhere in this module.


def get_binned_counts(bams, bin_size, regions=None):
    # Padding (bp) subtracted from region starts so fragments overlapping the
    # region boundary are still fetched.
    fs = 1000
    if regions is None:
        # Default to every contig present in the first BAM file.
        regions = [(c, None, None) for c in get_contig_sizes(bams[0]).keys()]
    else:
        # Normalise every region descriptor to a (contig, start, end) tuple.
        for i, r in enumerate(regions):
            if isinstance(r, str):
                regions[i] = (r, None, None)
            else:
                contig, start, end = r
                if isinstance(start, int):
                    start = max(0, start - fs)
                regions[i] = (contig, start, end)

    # One counting job per (region, bam) combination.
    jobs = [(bam_path, bin_size, *region)
            for region, bam_path in product(regions, bams)]

    cut_counts = defaultdict(Counter)
    with Pool() as workers:
        # Merge the per-job count dictionaries as the workers complete them.
        for i, (cc, contig, bam_path) in enumerate(
                workers.imap(_generate_count_dict, jobs)):
            for k, v in cc.items():
                cut_counts[k] += v
            print(i, '/', len(jobs), end='\r')

    return pd.DataFrame(cut_counts).T
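
# Usage sketch (not part of the original module): count cut sites in 100 kb
# bins over two BAM files. The file names and region list are hypothetical
# placeholders.
def _example_get_binned_counts():
    counts = get_binned_counts(
        ['sampleA.bam', 'sampleB.bam'],                  # hypothetical BAMs
        bin_size=100_000,
        # Bare contig names and (contig, start, end) tuples may be mixed;
        # get_binned_counts normalises both forms.
        regions=['chr1', ('chr2', 1_000_000, 2_000_000)])
    print(counts.head())
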
def blacklisted_binning_contigs(contig_length_resource: str,
                                bin_size: int,
                                fragment_size: int,
                                blacklist_path: str = None,
                                contig_whitelist: list = None) -> Generator:
    """
    Generate a list of (contig, bin_start, bin_end) tuples of size bin_size or smaller or when fragment_size is supplied
    (contig, bin_start, bin_end, fetch_start, fetch_end). All regions present in the blacklist BED file will not be
    part of the generated bins.

    Args:
        contig_length_resource(str): Path to bam file from which to extract the contig lengths
        bin_size(int) : maximum size of generated bins (might produce some bins which are smaller)
        fragment_size(int) : When this value is supplied fetch_start, fetch_end will be produced which will be equal to
                            bin_start-fragment_size and bin_end+fragment size. But will never overlap with blacklisted
                            regions or exceed contig boundaries.
        blacklist_path(str): path to blacklist bed file
        contig_whitelist(iterable): A set of contigs to only include in the result. All contigs are included when
                                    contig_whitelist is not specified.

    Returns:
        bin_tuples(Generator): (contig, bin_start, bin_end),
                               ( contig, bin_start, bin_end, fetch_start, fetch_end ) when fragment_size is specified
    """

    if blacklist_path is not None:
        blacklist_dict = get_bins_from_bed_dict(blacklist_path)
    else:
        blacklist_dict = {}

    for contig, length in (get_contig_sizes(contig_length_resource).items()
                           if isinstance(contig_length_resource, str) else
                           contig_length_resource):
        if contig_whitelist is not None and contig not in contig_whitelist:
            continue

        if fragment_size is not None:
            for bin_start, bin_end, fetch_start, fetch_end in \
                    blacklisted_binning(
                        start_coord=0,
                        end_coord=length,
                        bin_size=bin_size,
                        blacklist=sorted(blacklist_dict.get(contig, [])),
                        fragment_size=fragment_size):
                yield contig, bin_start, bin_end, fetch_start, fetch_end
        else:
            for bin_start, bin_end in \
                    blacklisted_binning(
                        start_coord=0,
                        end_coord=length,
                        bin_size=bin_size,
                        blacklist=sorted(blacklist_dict.get(contig, []))):
                yield contig, bin_start, bin_end
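
# Usage sketch: iterate over 1 Mb bins of chr1 and chr2 only, skipping
# blacklisted regions; every bin comes with a padded fetch window. The file
# paths are hypothetical placeholders.
def _example_blacklisted_bins():
    for contig, bin_start, bin_end, fetch_start, fetch_end in \
            blacklisted_binning_contigs('sample.bam',
                                        bin_size=1_000_000,
                                        fragment_size=500,
                                        blacklist_path='blacklist.bed',
                                        contig_whitelist={'chr1', 'chr2'}):
        print(contig, bin_start, bin_end, fetch_start, fetch_end)
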
def gc_correct_cells(df, gc_vector, norm_method='median'):
    # Reconstructed head: this fragment arrived without its enclosing
    # function, so the name and signature above are inferred (hypothetical)
    # from the names used in the body. gc_correct is assumed to be a
    # module-level worker function and MAXCP a module-level copy-number cap.
    with Pool() as workers:
        # GC-correct every cell (one row of df) in parallel.
        corrected_cells = list(workers.imap(
            gc_correct,
            [(row, gc_vector.values, MAXCP) for cell, row in df.iterrows()]))

    corrected_cells = pd.concat(corrected_cells, axis=1).T

    # Rescale each cell so its median (or mean) bin value sits at ~2 copies.
    if norm_method == 'median':
        corrected_cells = ((corrected_cells.T / corrected_cells.median(1)) *
                           2).T
    elif norm_method == 'mean':
        corrected_cells = ((corrected_cells.T / corrected_cells.mean(1)) * 2).T
    else:
        raise ValueError('norm_method not understood')

    return corrected_cells
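
# Minimal numeric sketch of the median scaling used above: each cell (row)
# is divided by its own median and multiplied by 2, so a typical bin lands
# at copy number ~2.
def _example_median_scaling():
    toy = pd.DataFrame([[10, 20, 10],
                        [5, 5, 10]], index=['cellA', 'cellB'])
    scaled = ((toy.T / toy.median(1)) * 2).T
    print(scaled)  # cellA -> [2, 4, 2]; cellB -> [2, 2, 4]
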


def generate_jobs(alignments_path, bin_size=1_000_000, bins_per_job=10):
    # Yield (contig, start, end) spans of bins_per_job * bin_size base pairs
    # covering every contig in the alignment file.
    for job_group in (
        ((contig, start, start + bin_size * bins_per_job)
         for start in range(0, length, bin_size * bins_per_job))
            for contig, length in get_contig_sizes(alignments_path).items()):
        yield from job_group
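
# Sketch of the spans generate_jobs emits: with the defaults each job covers
# 10 Mb. For a hypothetical 25 Mb contig the starts are 0, 10 Mb and 20 Mb;
# the last span runs past the contig end and is presumably clipped by the
# downstream fetch.
def _example_job_spans(length=25_000_000, bin_size=1_000_000, bins_per_job=10):
    span = bin_size * bins_per_job
    return [('chrToy', start, start + span) for start in range(0, length, span)]
    # -> [('chrToy', 0, 10000000), ('chrToy', 10000000, 20000000),
    #     ('chrToy', 20000000, 30000000)]
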


def generate_commands(alignments_path,
                      bin_size=1_000_000,
                      bins_per_job=10,
                      alt_spans=None,
                      min_mq=50,
                      max_fragment_size=1000,
                      head=None,
                      key_tags=None,
                      dedup=True,
                      kwargs=None,
                      skip_contigs=None):
    # Body merged from a duplicated, older copy of this function that followed
    # in the original text; the skip_contigs filter and the kwargs
    # pass-through are inferred from the parameter names.
    for i, (contig, start, end) in enumerate(
            generate_jobs(alignments_path=alignments_path,
                          bin_size=bin_size,
                          bins_per_job=bins_per_job)):
        if skip_contigs is not None and contig in skip_contigs:
            continue
        yield (alignments_path, bin_size, max_fragment_size,
               contig, start, end,
               min_mq, alt_spans, key_tags, dedup, kwargs)
        if head is not None and i >= (head - 1):
            break

def count_fragments_binned(args):
    # kwargs added to the unpack to match the tuple yielded by
    # generate_commands above. The function body is truncated in this excerpt.
    (alignments_path, bin_size, max_fragment_size,
     contig, start, end,
     min_mq, alt_spans, key_tags, dedup, kwargs) = args
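
# End-to-end sketch (hypothetical path; count_fragments_binned is truncated
# above, so the result handling is schematic): distribute the binned counting
# jobs over a worker pool.
def _example_run_pipeline(alignments_path='sample.bam'):
    commands = list(generate_commands(alignments_path,
                                      bin_size=1_000_000,
                                      bins_per_job=10,
                                      head=4))  # only the first 4 jobs
    with Pool() as workers:
        for result in workers.imap_unordered(count_fragments_binned, commands):
            pass  # merge per-job results here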