import numpy as np

# NOTE: GTF (from the deeptoolsintervals package) plus the getUserRegion and
# blSubtract helpers are assumed to be imported/defined elsewhere in this module.


def getChunkLength(args, chromSize):
    """
    There's no point in parsing the GTF file over and over again needlessly.
    Empirically, it seems that adding ~4x the number of workers is ideal,
    since coverage is non-uniform. This is a heuristic way of approximating
    that.

    Note that if there are MANY small contigs and a few large ones (e.g., the
    max and median lengths are >10x different), then it's best to take a
    different tack.
    """
    if args.region:
        # Restrict to the user-supplied region and split its span over
        # roughly 4 chunks per worker.
        chromSize, region_start, region_end, genomeChunkLength = getUserRegion(chromSize, args.region)
        rv = np.ceil((region_end - region_start) / float(4 * args.numberOfProcessors)).astype(int)
        return max(1, rv)

    bl = None
    if args.blackListFileName:
        bl = GTF(args.blackListFileName)

    # Collect the lengths of all contigs after subtracting blacklisted regions.
    lengths = []
    for k, v in chromSize:
        regs = blSubtract(bl, k, [0, v])
        for reg in regs:
            lengths.append(reg[1] - reg[0])

    if len(lengths) >= 4 * args.numberOfProcessors:
        rv = np.median(lengths).astype(int)
        # In cases like dm6 or GRCh38 there are a LOT of really small contigs,
        # which will cause the median to be small and performance to tank.
        if np.max(lengths) >= 10 * rv:
            rv = np.ceil(np.sum(lengths) / (4.0 * args.numberOfProcessors)).astype(int)
    else:
        rv = np.ceil(np.sum(lengths) / (4.0 * args.numberOfProcessors)).astype(int)

    return max(1, rv)
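
# --- Illustrative usage sketch (not part of the original module) -----------
# A minimal, hypothetical example of how getChunkLength might be called,
# assuming an argparse-style `args` object exposing region, blackListFileName,
# and numberOfProcessors, a chromSize list of (name, length) tuples, and that
# blSubtract returns regions unchanged when no blacklist is supplied. The
# values below are made up for demonstration.
if __name__ == "__main__":
    from argparse import Namespace

    args = Namespace(region=None, blackListFileName=None, numberOfProcessors=4)
    chromSize = [("chr1", 248956422), ("chr2", 242193529), ("chrM", 16569)]

    # With only 3 contigs (< 4 * 4 workers), the sum-based fallback applies:
    # ceil(total_length / (4 * numberOfProcessors)).
    print(getChunkLength(args, chromSize))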