def _merge_by_batch(batch, fnames):
    """Merge all calls in a family into a single callset.
    """
    merge_dir = utils.safe_makedir(os.path.join(os.getcwd(), "merged"))
    clean_dir = utils.safe_makedir(os.path.join(merge_dir, "clean"))
    merge_file = os.path.join(merge_dir, "%s-ensemble.bed" % batch)
    if not utils.file_uptodate(merge_file, fnames[0]):
        # Remove stale ensemble outputs for this batch before regenerating the merge
        for fname in glob.glob(os.path.join(merge_dir, "%s-ensemble*" % batch)):
            os.remove(fname)
        ensemble.combine_bed_by_size(fnames, batch, merge_dir, {}, delim="&&")
    return bedutils.clean_file(merge_file, {}, bedprep_dir=clean_dir)
def _combine_calls(in_files, sample_info):
    """Prepare size-filtered per-sample call files, then merge them by batch.
    """
    sample_files = {}
    sample_dir = utils.safe_makedir(os.path.join(os.getcwd(), "prepped"))
    files_by_batch = collections.defaultdict(list)
    for fname in in_files:
        descr = os.path.basename(fname).split("-")[0]
        sample_files[descr] = fname
        cur_file = os.path.join(sample_dir, os.path.basename(fname))
        files_by_batch[sample_info[descr]["batch"]].append(cur_file)
        if not utils.file_uptodate(cur_file, fname):
            with open(cur_file, "w") as out_handle:
                with open(fname) as in_handle:
                    for line in in_handle:
                        chrom, start, end, calls = line.strip().split("\t")[:4]
                        info = {"sample": descr,
                                "phenotype": sample_info[descr]["phenotype"],
                                "risk": sample_info[descr]["risk"],
                                "calls": calls.split(","),
                                "region": "%s:%s-%s" % (chrom, start, end)}
                        is_cnv = any(x.startswith("cnv") for x in info["calls"])
                        # Keep events above the minimum size; non-CNV calls must also
                        # fall below the maximum SV size cutoff
                        if (int(end) - int(start) >= MIN_SIZE
                              and (is_cnv or int(end) - int(start) < MAX_SV_SIZE)):
                            out_handle.write("\t".join([chrom, start, end, json.dumps(info)]) + "\n")
    samples_by_batch = collections.defaultdict(list)
    for s, x in sample_info.items():
        x["sample"] = s
        samples_by_batch[x["batch"]].append(x)
    ready_files = []
    for batch, fnames in files_by_batch.items():
        merged_file = _merge_by_batch(batch, fnames)
        filtered_file = _filter_batch(merged_file, samples_by_batch[batch])
        ready_files.append(filtered_file)
    return ensemble.combine_bed_by_size(ready_files, "combined", os.getcwd(), {}), sample_files
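# Illustrative input shapes for _combine_calls (a sketch; the sample names,
# paths and attribute values below are hypothetical, assuming per-sample BED
# files whose basenames start with the sample identifier and whose fourth
# column lists the supporting callers, plus a sample_info dict keyed by sample
# with "batch", "phenotype" and "risk" attributes):
#
#   in_files = ["calls/NA12878-ensemble.bed", "calls/NA12891-ensemble.bed"]
#   sample_info = {"NA12878": {"batch": "fam1", "phenotype": "affected", "risk": "high"},
#                  "NA12891": {"batch": "fam1", "phenotype": "unaffected", "risk": "low"}}
#
#   combined_bed, sample_files = _combine_calls(in_files, sample_info)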